Skip to content

Commit 2006c90

Browse files
authored
feat(operator): add per-tenant namespace RBAC for multi-tenant job execution (#656)
* feat(operator): add per-tenant namespace RBAC for multi-tenant job execution
* feat(operator): annotate tenant RBAC resources with owner
1 parent 3fd576a commit 2006c90

File tree

5 files changed

+712
-0
lines changed

5 files changed

+712
-0
lines changed

config/rbac/role.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ rules:
3333
- apiGroups:
3434
- ""
3535
resources:
36+
- namespaces
3637
- persistentvolumes
3738
verbs:
3839
- get

controllers/evalhub/constants.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,11 @@ const (
5858
providerNameLabel = "trustyai.opendatahub.io/evalhub-provider-name"
5959
providersVolumeName = "evalhub-providers"
6060
providersMountPath = configDirPath + "/providers"
61+
62+
// Tenant namespace configuration
63+
tenantAnnotation = "mlflow.kubeflow.org/workspace-description"
64+
tenantLabel = "trustyai.opendatahub.io/managed-by"
65+
tenantOwnerAnnotation = "trustyai.opendatahub.io/owner"
6166
)
6267

6368
var (

controllers/evalhub/evalhub_controller.go

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,10 @@ import (
1717
ctrl "sigs.k8s.io/controller-runtime"
1818
"sigs.k8s.io/controller-runtime/pkg/client"
1919
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
20+
"sigs.k8s.io/controller-runtime/pkg/handler"
2021
"sigs.k8s.io/controller-runtime/pkg/log"
2122
"sigs.k8s.io/controller-runtime/pkg/manager"
23+
"sigs.k8s.io/controller-runtime/pkg/reconcile"
2224
)
2325

2426
func ControllerSetUp(mgr manager.Manager, ns string, recorder record.EventRecorder) error {
@@ -54,6 +56,7 @@ type EvalHubReconciler struct {
5456
//+kubebuilder:rbac:groups="",resources=events,verbs=create;patch;update
5557
//+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=roles,verbs=get;list;watch;create;update;patch;delete
5658
//+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=rolebindings,verbs=get;list;watch;create;update;patch;delete
59+
//+kubebuilder:rbac:groups="",resources=namespaces,verbs=get;list;watch
5760

5861
// Reconcile is part of the main kubernetes reconciliation loop which aims to
5962
// move the current state of the cluster closer to the desired state.
@@ -121,6 +124,12 @@ func (r *EvalHubReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
121124
return RequeueWithError(err)
122125
}
123126

127+
// Reconcile tenant namespace RBAC (non-fatal)
128+
if err := r.reconcileTenantNamespaces(ctx, instance); err != nil {
129+
log.Error(err, "Failed to reconcile tenant namespaces")
130+
r.EventRecorder.Event(instance, corev1.EventTypeWarning, "TenantRBACError", err.Error())
131+
}
132+
124133
// Reconcile ConfigMap
125134
if err := r.reconcileConfigMap(ctx, instance); err != nil {
126135
log.Error(err, "Failed to reconcile ConfigMap")
@@ -200,9 +209,36 @@ func (r *EvalHubReconciler) SetupWithManager(mgr ctrl.Manager) error {
200209
Owns(&appsv1.Deployment{}).
201210
Owns(&corev1.Service{}).
202211
Owns(&corev1.ConfigMap{}).
212+
Watches(&corev1.Namespace{},
213+
handler.EnqueueRequestsFromMapFunc(r.namespacesToEvalHub),
214+
).
203215
Complete(r)
204216
}
205217

218+
// namespacesToEvalHub maps namespace events to EvalHub reconcile requests.
219+
// When a namespace is created/updated/deleted, all EvalHub instances are re-reconciled
220+
// so they can provision or clean up tenant RBAC.
221+
func (r *EvalHubReconciler) namespacesToEvalHub(ctx context.Context, obj client.Object) []reconcile.Request {
222+
log := log.FromContext(ctx)
223+
224+
evalHubList := &evalhubv1alpha1.EvalHubList{}
225+
if err := r.List(ctx, evalHubList); err != nil {
226+
log.Error(err, "Failed to list EvalHub instances for namespace watch")
227+
return nil
228+
}
229+
230+
var requests []reconcile.Request
231+
for _, eh := range evalHubList.Items {
232+
requests = append(requests, reconcile.Request{
233+
NamespacedName: types.NamespacedName{
234+
Name: eh.Name,
235+
Namespace: eh.Namespace,
236+
},
237+
})
238+
}
239+
return requests
240+
}
241+
206242
// Helper functions for reconcile results
207243
func DoNotRequeue() (ctrl.Result, error) {
208244
return ctrl.Result{}, nil
@@ -231,6 +267,12 @@ func (r *EvalHubReconciler) handleDeletion(ctx context.Context, instance *evalhu
231267
return RequeueWithError(err)
232268
}
233269

270+
// Clean up tenant namespace resources (label-based, no owner refs)
271+
if err := r.cleanupTenantResources(ctx, instance); err != nil {
272+
log.Error(err, "Failed to cleanup tenant resources")
273+
return RequeueWithError(err)
274+
}
275+
234276
// Remove finalizer
235277
controllerutil.RemoveFinalizer(instance, evalhubv1alpha1.FinalizerName)
236278
if err := r.Update(ctx, instance); err != nil {

controllers/evalhub/service_accounts.go

Lines changed: 255 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"context"
55
"crypto/sha256"
66
"encoding/hex"
7+
"fmt"
78
"regexp"
89
"sort"
910
"strings"
@@ -12,6 +13,7 @@ import (
1213
"github.com/trustyai-explainability/trustyai-service-operator/controllers/constants"
1314
"k8s.io/apimachinery/pkg/api/errors"
1415
"k8s.io/apimachinery/pkg/types"
16+
"sigs.k8s.io/controller-runtime/pkg/client"
1517
"sigs.k8s.io/controller-runtime/pkg/log"
1618

1719
corev1 "k8s.io/api/core/v1"
@@ -752,3 +754,256 @@ func (r *EvalHubReconciler) createJobsServiceAccount(ctx context.Context, instan
752754

753755
return nil
754756
}
757+
758+
// reconcileTenantNamespaces discovers namespaces with the tenant annotation and
759+
// provisions per-tenant RBAC (SA + RoleBindings) so the API SA can create jobs
760+
// in tenant namespaces. It also cleans up resources in namespaces that lost the
761+
// annotation.
762+
func (r *EvalHubReconciler) reconcileTenantNamespaces(ctx context.Context, instance *evalhubv1alpha1.EvalHub) error {
763+
log := log.FromContext(ctx)
764+
765+
// List all namespaces
766+
nsList := &corev1.NamespaceList{}
767+
if err := r.List(ctx, nsList); err != nil {
768+
return fmt.Errorf("listing namespaces: %w", err)
769+
}
770+
771+
// Build set of annotated tenant namespaces (excluding control-plane)
772+
tenantNS := make(map[string]bool)
773+
for _, ns := range nsList.Items {
774+
if ns.Name == instance.Namespace {
775+
continue
776+
}
777+
if _, ok := ns.Annotations[tenantAnnotation]; ok {
778+
tenantNS[ns.Name] = true
779+
}
780+
}
781+
782+
// Reconcile each tenant namespace
783+
for ns := range tenantNS {
784+
if err := r.reconcileTenantNamespace(ctx, instance, ns); err != nil {
785+
log.Error(err, "Failed to reconcile tenant namespace", "namespace", ns)
786+
return fmt.Errorf("reconciling tenant namespace %s: %w", ns, err)
787+
}
788+
}
789+
790+
// Cleanup: find managed resources in namespaces that no longer have the annotation
791+
managedLabel := client.MatchingLabels{tenantLabel: instance.Name}
792+
793+
// Cleanup stale ServiceAccounts
794+
saList := &corev1.ServiceAccountList{}
795+
if err := r.List(ctx, saList, managedLabel); err != nil {
796+
return fmt.Errorf("listing managed service accounts: %w", err)
797+
}
798+
for i := range saList.Items {
799+
sa := &saList.Items[i]
800+
if !tenantNS[sa.Namespace] && sa.Namespace != instance.Namespace {
801+
log.Info("Cleaning up stale tenant SA", "namespace", sa.Namespace, "name", sa.Name)
802+
if err := r.Delete(ctx, sa); err != nil && !errors.IsNotFound(err) {
803+
return fmt.Errorf("deleting stale SA %s/%s: %w", sa.Namespace, sa.Name, err)
804+
}
805+
}
806+
}
807+
808+
// Cleanup stale RoleBindings
809+
rbList := &rbacv1.RoleBindingList{}
810+
if err := r.List(ctx, rbList, managedLabel); err != nil {
811+
return fmt.Errorf("listing managed role bindings: %w", err)
812+
}
813+
for i := range rbList.Items {
814+
rb := &rbList.Items[i]
815+
if !tenantNS[rb.Namespace] && rb.Namespace != instance.Namespace {
816+
log.Info("Cleaning up stale tenant RoleBinding", "namespace", rb.Namespace, "name", rb.Name)
817+
if err := r.Delete(ctx, rb); err != nil && !errors.IsNotFound(err) {
818+
return fmt.Errorf("deleting stale RoleBinding %s/%s: %w", rb.Namespace, rb.Name, err)
819+
}
820+
}
821+
}
822+
823+
return nil
824+
}
825+
826+
// reconcileTenantNamespace creates per-tenant RBAC resources in the given namespace.
827+
// All resources are labelled with tenantLabel for cleanup (no owner refs, since
828+
// cross-namespace owner references are forbidden).
829+
func (r *EvalHubReconciler) reconcileTenantNamespace(ctx context.Context, instance *evalhubv1alpha1.EvalHub, namespace string) error {
830+
log := log.FromContext(ctx)
831+
log.Info("Reconciling tenant namespace RBAC", "namespace", namespace)
832+
833+
apiSAName := generateServiceAccountName(instance)
834+
jobsSAName := generateJobsServiceAccountName(instance)
835+
836+
managedLabels := map[string]string{
837+
tenantLabel: instance.Name,
838+
"app": "eval-hub",
839+
"app.kubernetes.io/instance": instance.Name,
840+
"app.kubernetes.io/part-of": "eval-hub",
841+
}
842+
843+
managedAnnotations := map[string]string{
844+
tenantOwnerAnnotation: "eval-hub",
845+
}
846+
847+
// 1. Create jobs SA in the tenant namespace
848+
if err := r.ensureTenantServiceAccount(ctx, jobsSAName, namespace, managedLabels, managedAnnotations); err != nil {
849+
return err
850+
}
851+
852+
// 2. RoleBinding: API SA → jobs-writer ClusterRole (create/delete jobs in tenant ns)
853+
if err := r.ensureTenantRoleBinding(ctx, instance.Name+"-tenant-jobs-writer", namespace, managedLabels, managedAnnotations,
854+
[]rbacv1.Subject{{
855+
Kind: "ServiceAccount",
856+
Name: apiSAName,
857+
Namespace: instance.Namespace,
858+
}},
859+
rbacv1.RoleRef{Kind: "ClusterRole", Name: jobsWriterClusterRoleName, APIGroup: rbacv1.GroupName},
860+
); err != nil {
861+
return err
862+
}
863+
864+
// 3. RoleBinding: API SA → job-config ClusterRole (create/get/list configmaps in tenant ns)
865+
if err := r.ensureTenantRoleBinding(ctx, instance.Name+"-tenant-job-config", namespace, managedLabels, managedAnnotations,
866+
[]rbacv1.Subject{{
867+
Kind: "ServiceAccount",
868+
Name: apiSAName,
869+
Namespace: instance.Namespace,
870+
}},
871+
rbacv1.RoleRef{Kind: "ClusterRole", Name: jobConfigClusterRoleName, APIGroup: rbacv1.GroupName},
872+
); err != nil {
873+
return err
874+
}
875+
876+
// 4. RoleBinding: API SA + Jobs SA (tenant) → mlflow-access ClusterRole
877+
if err := r.ensureTenantRoleBinding(ctx, instance.Name+"-tenant-mlflow", namespace, managedLabels, managedAnnotations,
878+
[]rbacv1.Subject{
879+
{
880+
Kind: "ServiceAccount",
881+
Name: apiSAName,
882+
Namespace: instance.Namespace,
883+
},
884+
{
885+
Kind: "ServiceAccount",
886+
Name: jobsSAName,
887+
Namespace: namespace,
888+
},
889+
},
890+
rbacv1.RoleRef{Kind: "ClusterRole", Name: mlflowAccessClusterRoleName, APIGroup: rbacv1.GroupName},
891+
); err != nil {
892+
return err
893+
}
894+
895+
return nil
896+
}
897+
898+
// ensureTenantServiceAccount creates a ServiceAccount in the given namespace if it
899+
// does not exist. No owner reference is set (cross-namespace not allowed).
900+
func (r *EvalHubReconciler) ensureTenantServiceAccount(ctx context.Context, name, namespace string, labels map[string]string, annotations map[string]string) error {
901+
sa := &corev1.ServiceAccount{}
902+
err := r.Get(ctx, types.NamespacedName{Name: name, Namespace: namespace}, sa)
903+
if err == nil {
904+
return nil // already exists
905+
}
906+
if !errors.IsNotFound(err) {
907+
return err
908+
}
909+
910+
log.FromContext(ctx).Info("Creating tenant SA", "namespace", namespace, "name", name)
911+
sa = &corev1.ServiceAccount{
912+
ObjectMeta: metav1.ObjectMeta{
913+
Name: name,
914+
Namespace: namespace,
915+
Labels: labels,
916+
Annotations: annotations,
917+
},
918+
}
919+
return r.Create(ctx, sa)
920+
}
921+
922+
// ensureTenantRoleBinding creates or updates a RoleBinding in the given namespace.
923+
// No owner reference is set (cross-namespace not allowed).
924+
func (r *EvalHubReconciler) ensureTenantRoleBinding(ctx context.Context, name, namespace string, labels map[string]string, annotations map[string]string, subjects []rbacv1.Subject, roleRef rbacv1.RoleRef) error {
925+
log := log.FromContext(ctx)
926+
927+
desired := &rbacv1.RoleBinding{
928+
ObjectMeta: metav1.ObjectMeta{
929+
Name: name,
930+
Namespace: namespace,
931+
Labels: labels,
932+
Annotations: annotations,
933+
},
934+
Subjects: subjects,
935+
RoleRef: roleRef,
936+
}
937+
938+
found := &rbacv1.RoleBinding{}
939+
err := r.Get(ctx, types.NamespacedName{Name: name, Namespace: namespace}, found)
940+
if err != nil && errors.IsNotFound(err) {
941+
log.Info("Creating tenant RoleBinding", "namespace", namespace, "name", name)
942+
return r.Create(ctx, desired)
943+
} else if err != nil {
944+
return err
945+
}
946+
947+
// Update if subjects or roleRef changed
948+
subjectsEqual := equalRoleBindingSubjects(found.Subjects, desired.Subjects)
949+
roleRefEqual := equalRoleBindingRoleRef(found.RoleRef, desired.RoleRef)
950+
951+
if !subjectsEqual || !roleRefEqual {
952+
if roleRefEqual && !subjectsEqual {
953+
found.Subjects = desired.Subjects
954+
log.Info("Updating tenant RoleBinding subjects", "name", name)
955+
return r.Update(ctx, found)
956+
}
957+
// RoleRef is immutable; delete and recreate
958+
log.Info("RoleRef differs, recreating tenant RoleBinding", "name", name)
959+
if err := r.Delete(ctx, found); err != nil {
960+
return err
961+
}
962+
return r.Create(ctx, desired)
963+
}
964+
965+
return nil
966+
}
967+
968+
// cleanupTenantResources removes all tenant-namespace resources managed by this
969+
// EvalHub instance (identified by tenantLabel). Called during EvalHub deletion.
970+
func (r *EvalHubReconciler) cleanupTenantResources(ctx context.Context, instance *evalhubv1alpha1.EvalHub) error {
971+
log := log.FromContext(ctx)
972+
log.Info("Cleaning up tenant resources", "instance", instance.Name)
973+
974+
managedLabel := client.MatchingLabels{tenantLabel: instance.Name}
975+
976+
// Delete managed RoleBindings across all namespaces
977+
rbList := &rbacv1.RoleBindingList{}
978+
if err := r.List(ctx, rbList, managedLabel); err != nil {
979+
return fmt.Errorf("listing managed RoleBindings for cleanup: %w", err)
980+
}
981+
for i := range rbList.Items {
982+
rb := &rbList.Items[i]
983+
if rb.Namespace == instance.Namespace {
984+
continue // control-plane resources cleaned by owner-ref GC
985+
}
986+
log.Info("Deleting tenant RoleBinding", "namespace", rb.Namespace, "name", rb.Name)
987+
if err := r.Delete(ctx, rb); err != nil && !errors.IsNotFound(err) {
988+
return fmt.Errorf("deleting tenant RoleBinding %s/%s: %w", rb.Namespace, rb.Name, err)
989+
}
990+
}
991+
992+
// Delete managed ServiceAccounts across all namespaces
993+
saList := &corev1.ServiceAccountList{}
994+
if err := r.List(ctx, saList, managedLabel); err != nil {
995+
return fmt.Errorf("listing managed SAs for cleanup: %w", err)
996+
}
997+
for i := range saList.Items {
998+
sa := &saList.Items[i]
999+
if sa.Namespace == instance.Namespace {
1000+
continue
1001+
}
1002+
log.Info("Deleting tenant SA", "namespace", sa.Namespace, "name", sa.Name)
1003+
if err := r.Delete(ctx, sa); err != nil && !errors.IsNotFound(err) {
1004+
return fmt.Errorf("deleting tenant SA %s/%s: %w", sa.Namespace, sa.Name, err)
1005+
}
1006+
}
1007+
1008+
return nil
1009+
}

0 commit comments

Comments
 (0)