Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ rules:
- apiGroups:
- ""
resources:
- namespaces
- persistentvolumes
verbs:
- get
Expand Down
5 changes: 5 additions & 0 deletions controllers/evalhub/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,11 @@ const (
providerNameLabel = "trustyai.opendatahub.io/evalhub-provider-name"
providersVolumeName = "evalhub-providers"
providersMountPath = configDirPath + "/providers"

// Tenant namespace configuration
tenantAnnotation = "mlflow.kubeflow.org/workspace-description"
tenantLabel = "trustyai.opendatahub.io/managed-by"
tenantOwnerAnnotation = "trustyai.opendatahub.io/owner"
)

var (
Expand Down
42 changes: 42 additions & 0 deletions controllers/evalhub/evalhub_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@ import (
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/handler"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
)

func ControllerSetUp(mgr manager.Manager, ns string, recorder record.EventRecorder) error {
Expand Down Expand Up @@ -54,6 +56,7 @@ type EvalHubReconciler struct {
//+kubebuilder:rbac:groups="",resources=events,verbs=create;patch;update
//+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=roles,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=rolebindings,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups="",resources=namespaces,verbs=get;list;watch

// Reconcile is part of the main kubernetes reconciliation loop which aims to
// move the current state of the cluster closer to the desired state.
Expand Down Expand Up @@ -121,6 +124,12 @@ func (r *EvalHubReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
return RequeueWithError(err)
}

// Reconcile tenant namespace RBAC (non-fatal)
if err := r.reconcileTenantNamespaces(ctx, instance); err != nil {
log.Error(err, "Failed to reconcile tenant namespaces")
r.EventRecorder.Event(instance, corev1.EventTypeWarning, "TenantRBACError", err.Error())
}

// Reconcile ConfigMap
if err := r.reconcileConfigMap(ctx, instance); err != nil {
log.Error(err, "Failed to reconcile ConfigMap")
Expand Down Expand Up @@ -200,9 +209,36 @@ func (r *EvalHubReconciler) SetupWithManager(mgr ctrl.Manager) error {
Owns(&appsv1.Deployment{}).
Owns(&corev1.Service{}).
Owns(&corev1.ConfigMap{}).
Watches(&corev1.Namespace{},
handler.EnqueueRequestsFromMapFunc(r.namespacesToEvalHub),
).
Complete(r)
}

// namespacesToEvalHub maps namespace events to EvalHub reconcile requests.
// When a namespace is created/updated/deleted, all EvalHub instances are re-reconciled
// so they can provision or clean up tenant RBAC.
func (r *EvalHubReconciler) namespacesToEvalHub(ctx context.Context, obj client.Object) []reconcile.Request {
log := log.FromContext(ctx)

evalHubList := &evalhubv1alpha1.EvalHubList{}
if err := r.List(ctx, evalHubList); err != nil {
log.Error(err, "Failed to list EvalHub instances for namespace watch")
return nil
}

var requests []reconcile.Request
for _, eh := range evalHubList.Items {
requests = append(requests, reconcile.Request{
NamespacedName: types.NamespacedName{
Name: eh.Name,
Namespace: eh.Namespace,
},
})
}
return requests
}

// Helper functions for reconcile results
func DoNotRequeue() (ctrl.Result, error) {
return ctrl.Result{}, nil
Expand Down Expand Up @@ -231,6 +267,12 @@ func (r *EvalHubReconciler) handleDeletion(ctx context.Context, instance *evalhu
return RequeueWithError(err)
}

// Clean up tenant namespace resources (label-based, no owner refs)
if err := r.cleanupTenantResources(ctx, instance); err != nil {
log.Error(err, "Failed to cleanup tenant resources")
return RequeueWithError(err)
}

// Remove finalizer
controllerutil.RemoveFinalizer(instance, evalhubv1alpha1.FinalizerName)
if err := r.Update(ctx, instance); err != nil {
Expand Down
255 changes: 255 additions & 0 deletions controllers/evalhub/service_accounts.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"crypto/sha256"
"encoding/hex"
"fmt"
"regexp"
"sort"
"strings"
Expand All @@ -12,6 +13,7 @@ import (
"github.com/trustyai-explainability/trustyai-service-operator/controllers/constants"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"

corev1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -752,3 +754,256 @@ func (r *EvalHubReconciler) createJobsServiceAccount(ctx context.Context, instan

return nil
}

// reconcileTenantNamespaces discovers namespaces with the tenant annotation and
// provisions per-tenant RBAC (SA + RoleBindings) so the API SA can create jobs
// in tenant namespaces. It also cleans up resources in namespaces that lost the
// annotation.
func (r *EvalHubReconciler) reconcileTenantNamespaces(ctx context.Context, instance *evalhubv1alpha1.EvalHub) error {
log := log.FromContext(ctx)

// List all namespaces
nsList := &corev1.NamespaceList{}
if err := r.List(ctx, nsList); err != nil {
return fmt.Errorf("listing namespaces: %w", err)
}

// Build set of annotated tenant namespaces (excluding control-plane)
tenantNS := make(map[string]bool)
for _, ns := range nsList.Items {
if ns.Name == instance.Namespace {
continue
}
if _, ok := ns.Annotations[tenantAnnotation]; ok {
tenantNS[ns.Name] = true
}
}

// Reconcile each tenant namespace
for ns := range tenantNS {
if err := r.reconcileTenantNamespace(ctx, instance, ns); err != nil {
log.Error(err, "Failed to reconcile tenant namespace", "namespace", ns)
return fmt.Errorf("reconciling tenant namespace %s: %w", ns, err)
}
}

// Cleanup: find managed resources in namespaces that no longer have the annotation
managedLabel := client.MatchingLabels{tenantLabel: instance.Name}

Comment on lines +791 to +792
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Tenant ownership selector is not unique enough across namespaces.

Using only instance.Name for tenantLabel allows collisions when two EvalHub CRs share a name in different namespaces, which can cause cross-instance update/cleanup of tenant RBAC.

🛠️ Suggested ownership key hardening
@@
- managedLabel := client.MatchingLabels{tenantLabel: instance.Name}
+ ownerKey := normalizeDNS1123LabelValue(instance.Namespace + "-" + instance.Name)
+ managedLabel := client.MatchingLabels{tenantLabel: ownerKey}
@@
- managedLabels := map[string]string{
-   tenantLabel:                  instance.Name,
+ ownerKey := normalizeDNS1123LabelValue(instance.Namespace + "-" + instance.Name)
+ managedLabels := map[string]string{
+   tenantLabel:                  ownerKey,
@@
- managedLabel := client.MatchingLabels{tenantLabel: instance.Name}
+ ownerKey := normalizeDNS1123LabelValue(instance.Namespace + "-" + instance.Name)
+ managedLabel := client.MatchingLabels{tenantLabel: ownerKey}

Also applies to: 836-838, 974-975

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@controllers/evalhub/service_accounts.go` around lines 791 - 792, The tenant
ownership selector uses only instance.Name (managedLabel :=
client.MatchingLabels{tenantLabel: instance.Name}) which can collide across
namespaces; change the label value to include the namespace (e.g., combine
instance.Namespace and instance.Name or use instance.GetNamespace()) so the
selector is unique per EvalHub CR, and update the other occurrences of
tenantLabel usage (the other managedLabel/selector constructions around the
references noted) to use the same namespaced label format so RBAC cleanup/update
won't cross namespaces.

// Cleanup stale ServiceAccounts
saList := &corev1.ServiceAccountList{}
if err := r.List(ctx, saList, managedLabel); err != nil {
return fmt.Errorf("listing managed service accounts: %w", err)
}
for i := range saList.Items {
sa := &saList.Items[i]
if !tenantNS[sa.Namespace] && sa.Namespace != instance.Namespace {
log.Info("Cleaning up stale tenant SA", "namespace", sa.Namespace, "name", sa.Name)
if err := r.Delete(ctx, sa); err != nil && !errors.IsNotFound(err) {
return fmt.Errorf("deleting stale SA %s/%s: %w", sa.Namespace, sa.Name, err)
}
}
}

// Cleanup stale RoleBindings
rbList := &rbacv1.RoleBindingList{}
if err := r.List(ctx, rbList, managedLabel); err != nil {
return fmt.Errorf("listing managed role bindings: %w", err)
}
for i := range rbList.Items {
rb := &rbList.Items[i]
if !tenantNS[rb.Namespace] && rb.Namespace != instance.Namespace {
log.Info("Cleaning up stale tenant RoleBinding", "namespace", rb.Namespace, "name", rb.Name)
if err := r.Delete(ctx, rb); err != nil && !errors.IsNotFound(err) {
return fmt.Errorf("deleting stale RoleBinding %s/%s: %w", rb.Namespace, rb.Name, err)
}
}
}

return nil
}

// reconcileTenantNamespace creates per-tenant RBAC resources in the given namespace.
// All resources are labelled with tenantLabel for cleanup (no owner refs, since
// cross-namespace owner references are forbidden).
func (r *EvalHubReconciler) reconcileTenantNamespace(ctx context.Context, instance *evalhubv1alpha1.EvalHub, namespace string) error {
log := log.FromContext(ctx)
log.Info("Reconciling tenant namespace RBAC", "namespace", namespace)

apiSAName := generateServiceAccountName(instance)
jobsSAName := generateJobsServiceAccountName(instance)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

generateJobsServiceAccountName adds the -api suffix. For jobs SA we should probably have -job suffix.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mariusdanciu thanks, I'll check again. I thought the two separate functions created

  • generateServiceAccountName(): {name}-api (the API/proxy SA)
  • generateJobsServiceAccountName(): {name}-jobs (the jobs SA)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I might have misread the diff.


managedLabels := map[string]string{
tenantLabel: instance.Name,
"app": "eval-hub",
"app.kubernetes.io/instance": instance.Name,
"app.kubernetes.io/part-of": "eval-hub",
}

managedAnnotations := map[string]string{
tenantOwnerAnnotation: "eval-hub",
}

// 1. Create jobs SA in the tenant namespace
if err := r.ensureTenantServiceAccount(ctx, jobsSAName, namespace, managedLabels, managedAnnotations); err != nil {
return err
}

// 2. RoleBinding: API SA → jobs-writer ClusterRole (create/delete jobs in tenant ns)
if err := r.ensureTenantRoleBinding(ctx, instance.Name+"-tenant-jobs-writer", namespace, managedLabels, managedAnnotations,
[]rbacv1.Subject{{
Kind: "ServiceAccount",
Name: apiSAName,
Namespace: instance.Namespace,
}},
rbacv1.RoleRef{Kind: "ClusterRole", Name: jobsWriterClusterRoleName, APIGroup: rbacv1.GroupName},
); err != nil {
return err
}

// 3. RoleBinding: API SA → job-config ClusterRole (create/get/list configmaps in tenant ns)
if err := r.ensureTenantRoleBinding(ctx, instance.Name+"-tenant-job-config", namespace, managedLabels, managedAnnotations,
[]rbacv1.Subject{{
Kind: "ServiceAccount",
Name: apiSAName,
Namespace: instance.Namespace,
}},
rbacv1.RoleRef{Kind: "ClusterRole", Name: jobConfigClusterRoleName, APIGroup: rbacv1.GroupName},
); err != nil {
return err
}

// 4. RoleBinding: API SA + Jobs SA (tenant) → mlflow-access ClusterRole
if err := r.ensureTenantRoleBinding(ctx, instance.Name+"-tenant-mlflow", namespace, managedLabels, managedAnnotations,
[]rbacv1.Subject{
{
Kind: "ServiceAccount",
Name: apiSAName,
Namespace: instance.Namespace,
},
{
Kind: "ServiceAccount",
Name: jobsSAName,
Namespace: namespace,
},
},
rbacv1.RoleRef{Kind: "ClusterRole", Name: mlflowAccessClusterRoleName, APIGroup: rbacv1.GroupName},
); err != nil {
return err
}

return nil
}

// ensureTenantServiceAccount creates a ServiceAccount in the given namespace if it
// does not exist. No owner reference is set (cross-namespace not allowed).
func (r *EvalHubReconciler) ensureTenantServiceAccount(ctx context.Context, name, namespace string, labels map[string]string, annotations map[string]string) error {
sa := &corev1.ServiceAccount{}
err := r.Get(ctx, types.NamespacedName{Name: name, Namespace: namespace}, sa)
if err == nil {
return nil // already exists
}
if !errors.IsNotFound(err) {
return err
}

log.FromContext(ctx).Info("Creating tenant SA", "namespace", namespace, "name", name)
sa = &corev1.ServiceAccount{
ObjectMeta: metav1.ObjectMeta{
Name: name,
Namespace: namespace,
Labels: labels,
Annotations: annotations,
},
}
return r.Create(ctx, sa)
}

// ensureTenantRoleBinding creates or updates a RoleBinding in the given namespace.
// No owner reference is set (cross-namespace not allowed).
func (r *EvalHubReconciler) ensureTenantRoleBinding(ctx context.Context, name, namespace string, labels map[string]string, annotations map[string]string, subjects []rbacv1.Subject, roleRef rbacv1.RoleRef) error {
log := log.FromContext(ctx)

desired := &rbacv1.RoleBinding{
ObjectMeta: metav1.ObjectMeta{
Name: name,
Namespace: namespace,
Labels: labels,
Annotations: annotations,
},
Subjects: subjects,
RoleRef: roleRef,
}

found := &rbacv1.RoleBinding{}
err := r.Get(ctx, types.NamespacedName{Name: name, Namespace: namespace}, found)
if err != nil && errors.IsNotFound(err) {
log.Info("Creating tenant RoleBinding", "namespace", namespace, "name", name)
return r.Create(ctx, desired)
} else if err != nil {
return err
}

// Update if subjects or roleRef changed
subjectsEqual := equalRoleBindingSubjects(found.Subjects, desired.Subjects)
roleRefEqual := equalRoleBindingRoleRef(found.RoleRef, desired.RoleRef)

if !subjectsEqual || !roleRefEqual {
if roleRefEqual && !subjectsEqual {
found.Subjects = desired.Subjects
log.Info("Updating tenant RoleBinding subjects", "name", name)
return r.Update(ctx, found)
}
// RoleRef is immutable; delete and recreate
log.Info("RoleRef differs, recreating tenant RoleBinding", "name", name)
if err := r.Delete(ctx, found); err != nil {
return err
}
return r.Create(ctx, desired)
}

return nil
}

// cleanupTenantResources removes all tenant-namespace resources managed by this
// EvalHub instance (identified by tenantLabel). Called during EvalHub deletion.
func (r *EvalHubReconciler) cleanupTenantResources(ctx context.Context, instance *evalhubv1alpha1.EvalHub) error {
log := log.FromContext(ctx)
log.Info("Cleaning up tenant resources", "instance", instance.Name)

managedLabel := client.MatchingLabels{tenantLabel: instance.Name}

// Delete managed RoleBindings across all namespaces
rbList := &rbacv1.RoleBindingList{}
if err := r.List(ctx, rbList, managedLabel); err != nil {
return fmt.Errorf("listing managed RoleBindings for cleanup: %w", err)
}
for i := range rbList.Items {
rb := &rbList.Items[i]
if rb.Namespace == instance.Namespace {
continue // control-plane resources cleaned by owner-ref GC
}
log.Info("Deleting tenant RoleBinding", "namespace", rb.Namespace, "name", rb.Name)
if err := r.Delete(ctx, rb); err != nil && !errors.IsNotFound(err) {
return fmt.Errorf("deleting tenant RoleBinding %s/%s: %w", rb.Namespace, rb.Name, err)
}
}

// Delete managed ServiceAccounts across all namespaces
saList := &corev1.ServiceAccountList{}
if err := r.List(ctx, saList, managedLabel); err != nil {
return fmt.Errorf("listing managed SAs for cleanup: %w", err)
}
for i := range saList.Items {
sa := &saList.Items[i]
if sa.Namespace == instance.Namespace {
continue
}
log.Info("Deleting tenant SA", "namespace", sa.Namespace, "name", sa.Name)
if err := r.Delete(ctx, sa); err != nil && !errors.IsNotFound(err) {
return fmt.Errorf("deleting tenant SA %s/%s: %w", sa.Namespace, sa.Name, err)
}
}

return nil
}
Loading