diff --git a/config/rbac/evalhub_proxy_role.yaml b/config/rbac/evalhub/evalhub_auth_reviewer_role.yaml similarity index 63% rename from config/rbac/evalhub_proxy_role.yaml rename to config/rbac/evalhub/evalhub_auth_reviewer_role.yaml index 54277e539..8ed8b1f38 100644 --- a/config/rbac/evalhub_proxy_role.yaml +++ b/config/rbac/evalhub/evalhub_auth_reviewer_role.yaml @@ -3,12 +3,12 @@ kind: ClusterRole metadata: labels: app.kubernetes.io/name: clusterrole - app.kubernetes.io/instance: evalhub-proxy-role + app.kubernetes.io/instance: evalhub-auth-reviewer-role app.kubernetes.io/component: kube-rbac-proxy app.kubernetes.io/created-by: trustyai-service-operator app.kubernetes.io/part-of: trustyai-service-operator app.kubernetes.io/managed-by: kustomize - name: evalhub-proxy-role + name: evalhub-auth-reviewer-role rules: - apiGroups: - authentication.k8s.io @@ -22,17 +22,3 @@ rules: - subjectaccessreviews verbs: - create - - apiGroups: - - trustyai.opendatahub.io - resources: - - evalhubs - verbs: - - get - - apiGroups: - - trustyai.opendatahub.io - resources: - - evalhubs/proxy - verbs: - - get - - create - - update diff --git a/config/rbac/evalhub_resource_manager_binding.yaml b/config/rbac/evalhub/evalhub_job_config_binding.yaml similarity index 81% rename from config/rbac/evalhub_resource_manager_binding.yaml rename to config/rbac/evalhub/evalhub_job_config_binding.yaml index 3cd48e48b..7b4109a87 100644 --- a/config/rbac/evalhub_resource_manager_binding.yaml +++ b/config/rbac/evalhub/evalhub_job_config_binding.yaml @@ -4,11 +4,11 @@ metadata: labels: app.kubernetes.io/component: evalhub app.kubernetes.io/name: trustyai-service-operator - name: evalhub-resource-manager-binding + name: evalhub-job-config-binding roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: evalhub-resource-manager + name: evalhub-job-config subjects: - kind: ServiceAccount name: controller-manager diff --git a/config/rbac/evalhub/evalhub_job_config_role.yaml b/config/rbac/evalhub/evalhub_job_config_role.yaml new file mode 100644 index 000000000..791454fd0 --- /dev/null +++ b/config/rbac/evalhub/evalhub_job_config_role.yaml @@ -0,0 +1,14 @@ +--- +# ClusterRole for EvalHub job ConfigMap management +# Split from the monolithic evalhub-resource-manager for least-privilege. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: evalhub-job-config + labels: + app.kubernetes.io/name: trustyai-service-operator + app.kubernetes.io/component: evalhub +rules: + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["create", "get", "update", "delete"] diff --git a/config/rbac/evalhub/evalhub_jobs_writer_binding.yaml b/config/rbac/evalhub/evalhub_jobs_writer_binding.yaml new file mode 100644 index 000000000..560e0fa13 --- /dev/null +++ b/config/rbac/evalhub/evalhub_jobs_writer_binding.yaml @@ -0,0 +1,15 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/component: evalhub + app.kubernetes.io/name: trustyai-service-operator + name: evalhub-jobs-writer-binding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: evalhub-jobs-writer +subjects: +- kind: ServiceAccount + name: controller-manager + namespace: system diff --git a/config/rbac/evalhub/evalhub_jobs_writer_role.yaml b/config/rbac/evalhub/evalhub_jobs_writer_role.yaml new file mode 100644 index 000000000..15db2f8f9 --- /dev/null +++ b/config/rbac/evalhub/evalhub_jobs_writer_role.yaml @@ -0,0 +1,14 @@ +--- +# ClusterRole for EvalHub job creation +# Split from the monolithic evalhub-resource-manager for least-privilege. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: evalhub-jobs-writer + labels: + app.kubernetes.io/name: trustyai-service-operator + app.kubernetes.io/component: evalhub +rules: + - apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["create", "delete"] diff --git a/config/rbac/evalhub/evalhub_mlflow_access_binding.yaml b/config/rbac/evalhub/evalhub_mlflow_access_binding.yaml new file mode 100644 index 000000000..ed8859f34 --- /dev/null +++ b/config/rbac/evalhub/evalhub_mlflow_access_binding.yaml @@ -0,0 +1,19 @@ +--- +# ClusterRoleBinding granting the operator SA the evalhub-mlflow-access permissions. +# The operator needs to hold these permissions itself in order to create +# namespace-scoped RoleBindings that reference this ClusterRole (RBAC escalation prevention). +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/component: evalhub + app.kubernetes.io/name: trustyai-service-operator + name: evalhub-mlflow-access-binding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: evalhub-mlflow-access +subjects: +- kind: ServiceAccount + name: controller-manager + namespace: system diff --git a/config/rbac/evalhub/evalhub_mlflow_access_role.yaml b/config/rbac/evalhub/evalhub_mlflow_access_role.yaml new file mode 100644 index 000000000..356b2b863 --- /dev/null +++ b/config/rbac/evalhub/evalhub_mlflow_access_role.yaml @@ -0,0 +1,18 @@ +--- +# ClusterRole for MLflow kubernetes-auth access +# MLflow's kubernetes-workspace-provider checks permissions via SelfSubjectAccessReview +# against the "mlflow.kubeflow.org" API group (not core Kubernetes resources). +# This role is pre-created at operator installation time and bound to +# EvalHub ServiceAccounts via RoleBindings created by the operator. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: evalhub-mlflow-access + labels: + app.kubernetes.io/name: trustyai-service-operator + app.kubernetes.io/component: evalhub +rules: + - apiGroups: ["mlflow.kubeflow.org"] + resources: + - experiments + verbs: ["create", "get", "list", "update", "delete"] diff --git a/config/rbac/evalhub/evalhub_mlflow_jobs_binding.yaml b/config/rbac/evalhub/evalhub_mlflow_jobs_binding.yaml new file mode 100644 index 000000000..a34dd845b --- /dev/null +++ b/config/rbac/evalhub/evalhub_mlflow_jobs_binding.yaml @@ -0,0 +1,19 @@ +--- +# ClusterRoleBinding granting the operator SA the evalhub-mlflow-jobs-access permissions. +# The operator needs to hold these permissions itself in order to create +# namespace-scoped RoleBindings that reference this ClusterRole (RBAC escalation prevention). +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/component: evalhub + app.kubernetes.io/name: trustyai-service-operator + name: evalhub-mlflow-jobs-access-binding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: evalhub-mlflow-jobs-access +subjects: +- kind: ServiceAccount + name: controller-manager + namespace: system diff --git a/config/rbac/evalhub/evalhub_mlflow_jobs_role.yaml b/config/rbac/evalhub/evalhub_mlflow_jobs_role.yaml new file mode 100644 index 000000000..6944b0254 --- /dev/null +++ b/config/rbac/evalhub/evalhub_mlflow_jobs_role.yaml @@ -0,0 +1,17 @@ +--- +# ClusterRole for MLflow kubernetes-auth access (jobs only) +# Jobs only need to create experiments and log metrics — they should not +# update or delete existing experiments. The proxy SA retains full CRUD +# via the broader evalhub-mlflow-access ClusterRole. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: evalhub-mlflow-jobs-access + labels: + app.kubernetes.io/name: trustyai-service-operator + app.kubernetes.io/component: evalhub +rules: + - apiGroups: ["mlflow.kubeflow.org"] + resources: + - experiments + verbs: ["create", "get", "list"] diff --git a/config/rbac/evalhub_jobs_proxy_role.yaml b/config/rbac/evalhub_jobs_proxy_role.yaml deleted file mode 100644 index 15b868342..000000000 --- a/config/rbac/evalhub_jobs_proxy_role.yaml +++ /dev/null @@ -1,20 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/name: clusterrole - app.kubernetes.io/instance: evalhub-jobs-proxy-role - app.kubernetes.io/component: evalhub-jobs - app.kubernetes.io/created-by: trustyai-service-operator - app.kubernetes.io/part-of: trustyai-service-operator - app.kubernetes.io/managed-by: kustomize - name: evalhub-jobs-proxy-role -rules: - - apiGroups: - - trustyai.opendatahub.io - resources: - - evalhubs/proxy - verbs: - - get - - create - - update diff --git a/config/rbac/evalhub_resource_manager_role.yaml b/config/rbac/evalhub_resource_manager_role.yaml deleted file mode 100644 index b2a81d2ff..000000000 --- a/config/rbac/evalhub_resource_manager_role.yaml +++ /dev/null @@ -1,26 +0,0 @@ ---- -# ClusterRole for EvalHub resource management -# This role is pre-created at operator installation time and bound to -# EvalHub proxy ServiceAccounts via RoleBindings created by the operator. -# This approach avoids giving the operator broad permissions that it would -# need to dynamically create Roles with these permissions. -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: evalhub-resource-manager - labels: - app.kubernetes.io/name: trustyai-service-operator - app.kubernetes.io/component: evalhub -rules: - # ConfigMap management for job specs - - apiGroups: [""] - resources: ["configmaps"] - verbs: ["create", "get", "list", "update", "patch", "delete"] - # Job management for evaluation jobs - - apiGroups: ["batch"] - resources: ["jobs"] - verbs: ["create", "get", "list", "watch", "update", "patch", "delete"] - # Service proxy access for job callbacks (scoped by RoleBinding to specific services) - - apiGroups: [""] - resources: ["services/proxy"] - verbs: ["get", "create", "update"] diff --git a/config/rbac/kustomization.yaml b/config/rbac/kustomization.yaml index 0235953b7..10e4c9d34 100644 --- a/config/rbac/kustomization.yaml +++ b/config/rbac/kustomization.yaml @@ -8,10 +8,15 @@ resources: - auth_proxy_role.yaml - auth_proxy_role_binding.yaml - auth_proxy_client_clusterrole.yaml - - evalhub_proxy_role.yaml - - evalhub_jobs_proxy_role.yaml - - evalhub_resource_manager_role.yaml - - evalhub_resource_manager_binding.yaml + - evalhub/evalhub_auth_reviewer_role.yaml + - evalhub/evalhub_jobs_writer_role.yaml + - evalhub/evalhub_jobs_writer_binding.yaml + - evalhub/evalhub_job_config_role.yaml + - evalhub/evalhub_job_config_binding.yaml + - evalhub/evalhub_mlflow_access_role.yaml + - evalhub/evalhub_mlflow_access_binding.yaml + - evalhub/evalhub_mlflow_jobs_role.yaml + - evalhub/evalhub_mlflow_jobs_binding.yaml - nemoguardrail_editor_role.yaml - nemoguardrail_viewer_role.yaml - trustyaiservice_editor_role.yaml diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 4bdd852af..a47605eda 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -161,6 +161,7 @@ rules: - rbac.authorization.k8s.io resources: - rolebindings + - roles verbs: - create - delete @@ -251,7 +252,6 @@ rules: verbs: - create - get - - update - apiGroups: - trustyai.opendatahub.io resources: diff --git a/controllers/evalhub/configmap.go b/controllers/evalhub/configmap.go index eedbd74e4..d121e582d 100644 --- a/controllers/evalhub/configmap.go +++ b/controllers/evalhub/configmap.go @@ -326,9 +326,12 @@ func (r *EvalHubReconciler) generateProxyConfigData(instance *evalhubv1alpha1.Ev proxyConfig := map[string]interface{}{ "authorization": map[string]interface{}{ "resourceAttributes": map[string]interface{}{ - "namespace": instance.Namespace, - "apiGroup": "trustyai.opendatahub.io", - "resource": "evalhubs", + "namespace": instance.Namespace, + "apiGroup": "trustyai.opendatahub.io", + "resource": "evalhubs", + // kube-rbac-proxy expects the Kubernetes ResourceAttributes key "name". + // Keep "resourceName" for compatibility with older config consumers. + "name": instance.Name, "resourceName": instance.Name, "subresource": "proxy", }, diff --git a/controllers/evalhub/constants.go b/controllers/evalhub/constants.go index 600098da5..42109092f 100644 --- a/controllers/evalhub/constants.go +++ b/controllers/evalhub/constants.go @@ -38,6 +38,17 @@ const ( dbDriver = "pgx" dbDefaultMaxOpen = 25 dbDefaultMaxIdle = 5 + + // Service CA configuration + serviceCAVolumeName = "service-ca" + serviceCAMountPath = "/etc/evalhub/ca" + serviceCACertFile = "service-ca.crt" + + // MLFlow projected token configuration + mlflowTokenVolumeName = "mlflow-token" + mlflowTokenMountPath = "/var/run/secrets/mlflow" + mlflowTokenFile = "token" + mlflowTokenExpiration = 3600 // seconds ) var ( diff --git a/controllers/evalhub/deployment.go b/controllers/evalhub/deployment.go index fb0bfa966..9d63877eb 100644 --- a/controllers/evalhub/deployment.go +++ b/controllers/evalhub/deployment.go @@ -119,6 +119,18 @@ func (r *EvalHubReconciler) buildDeploymentSpec(ctx context.Context, instance *e Name: "EVALHUB_INSTANCE_NAME", Value: instance.Name, }, + { + Name: "MLFLOW_CA_CERT_PATH", + Value: serviceCAMountPath + "/" + serviceCACertFile, + }, + { + Name: "MLFLOW_WORKSPACE", + Value: instance.Namespace, + }, + { + Name: "MLFLOW_TOKEN_PATH", + Value: mlflowTokenMountPath + "/" + mlflowTokenFile, + }, } // Merge environment variables with CR values taking precedence @@ -131,6 +143,16 @@ func (r *EvalHubReconciler) buildDeploymentSpec(ctx context.Context, instance *e MountPath: "/etc/evalhub", ReadOnly: true, }, + { + Name: serviceCAVolumeName, + MountPath: serviceCAMountPath, + ReadOnly: true, + }, + { + Name: mlflowTokenVolumeName, + MountPath: mlflowTokenMountPath, + ReadOnly: true, + }, } if instance.Spec.IsDatabaseConfigured() { volumeMounts = append(volumeMounts, corev1.VolumeMount{ @@ -139,7 +161,6 @@ func (r *EvalHubReconciler) buildDeploymentSpec(ctx context.Context, instance *e ReadOnly: true, }) } - // Container definition based on k8s examples container := corev1.Container{ Name: containerName, @@ -296,6 +317,31 @@ func (r *EvalHubReconciler) buildDeploymentSpec(ctx context.Context, instance *e }, }, }, + { + Name: serviceCAVolumeName, + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: instance.Name + "-service-ca", + }, + }, + }, + }, + { + Name: mlflowTokenVolumeName, + VolumeSource: corev1.VolumeSource{ + Projected: &corev1.ProjectedVolumeSource{ + Sources: []corev1.VolumeProjection{ + { + ServiceAccountToken: &corev1.ServiceAccountTokenProjection{ + Path: mlflowTokenFile, + ExpirationSeconds: func() *int64 { e := int64(mlflowTokenExpiration); return &e }(), + }, + }, + }, + }, + }, + }, } if instance.Spec.IsDatabaseConfigured() { volumes = append(volumes, corev1.Volume{ @@ -311,7 +357,6 @@ func (r *EvalHubReconciler) buildDeploymentSpec(ctx context.Context, instance *e }, }) } - // Pod template with both containers and required volumes podTemplate := corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ diff --git a/controllers/evalhub/deployment_test.go b/controllers/evalhub/deployment_test.go index 10f1d6c22..2c90e67c9 100644 --- a/controllers/evalhub/deployment_test.go +++ b/controllers/evalhub/deployment_test.go @@ -371,8 +371,8 @@ var _ = Describe("EvalHub Deployment", func() { }, deployment) Expect(err).NotTo(HaveOccurred()) - By("Checking deployment has 4 volumes (3 base + DB secret)") - Expect(deployment.Spec.Template.Spec.Volumes).To(HaveLen(4)) + By("Checking deployment has 6 volumes (5 base + DB secret)") + Expect(deployment.Spec.Template.Spec.Volumes).To(HaveLen(6)) By("Finding DB secret volume") var dbVolume *corev1.Volume @@ -399,8 +399,8 @@ var _ = Describe("EvalHub Deployment", func() { } Expect(evalHubContainer).NotTo(BeNil()) - By("Checking evalhub container has 2 volume mounts (config + DB secret)") - Expect(evalHubContainer.VolumeMounts).To(HaveLen(2)) + By("Checking evalhub container has 4 volume mounts (config + service-ca + mlflow-token + DB secret)") + Expect(evalHubContainer.VolumeMounts).To(HaveLen(4)) var dbMount *corev1.VolumeMount for i, mount := range evalHubContainer.VolumeMounts { @@ -426,8 +426,8 @@ var _ = Describe("EvalHub Deployment", func() { By("Getting deployment") deployment := waitForDeployment(evalHubName, testNamespace) - By("Checking deployment has only 3 base volumes") - Expect(deployment.Spec.Template.Spec.Volumes).To(HaveLen(3)) + By("Checking deployment has only 5 base volumes") + Expect(deployment.Spec.Template.Spec.Volumes).To(HaveLen(5)) By("Finding evalhub container") var evalHubContainer *corev1.Container @@ -439,8 +439,8 @@ var _ = Describe("EvalHub Deployment", func() { } Expect(evalHubContainer).NotTo(BeNil()) - By("Checking evalhub container has only 1 volume mount") - Expect(evalHubContainer.VolumeMounts).To(HaveLen(1)) + By("Checking evalhub container has 3 base volume mounts (config + service-ca + mlflow-token)") + Expect(evalHubContainer.VolumeMounts).To(HaveLen(3)) Expect(evalHubContainer.VolumeMounts[0].Name).To(Equal("evalhub-config")) }) }) @@ -645,7 +645,7 @@ var _ = Describe("EvalHub Deployment", func() { deployment := waitForDeployment(evalHubName, testNamespace) By("Checking deployment volumes") - Expect(deployment.Spec.Template.Spec.Volumes).To(HaveLen(3)) + Expect(deployment.Spec.Template.Spec.Volumes).To(HaveLen(5)) var evalHubConfigVolume, configVolume, tlsVolume *corev1.Volume for _, volume := range deployment.Spec.Template.Spec.Volumes { @@ -674,7 +674,7 @@ var _ = Describe("EvalHub Deployment", func() { Expect(tlsVolume.VolumeSource.Secret.SecretName).To(Equal(evalHubName + "-tls")) }) - It("should configure service account for proxy", func() { + It("should configure service account for API", func() { By("Reconciling deployment") err := reconciler.reconcileDeployment(ctx, evalHub) Expect(err).NotTo(HaveOccurred()) @@ -682,8 +682,8 @@ var _ = Describe("EvalHub Deployment", func() { By("Getting deployment") deployment := waitForDeployment(evalHubName, testNamespace) - By("Checking service account name") - Expect(deployment.Spec.Template.Spec.ServiceAccountName).To(Equal(evalHubName + "-proxy")) + By("Checking service account name uses -api suffix") + Expect(deployment.Spec.Template.Spec.ServiceAccountName).To(Equal(evalHubName + "-api")) }) }) }) diff --git a/controllers/evalhub/evalhub_controller.go b/controllers/evalhub/evalhub_controller.go index 9608f4fa0..3f098d900 100644 --- a/controllers/evalhub/evalhub_controller.go +++ b/controllers/evalhub/evalhub_controller.go @@ -41,9 +41,9 @@ type EvalHubReconciler struct { } //+kubebuilder:rbac:groups=trustyai.opendatahub.io,resources=evalhubs,verbs=get;list;watch;create;update;patch;delete +//+kubebuilder:rbac:groups=trustyai.opendatahub.io,resources=evalhubs/proxy,verbs=get;create //+kubebuilder:rbac:groups=trustyai.opendatahub.io,resources=evalhubs/status,verbs=get;update;patch //+kubebuilder:rbac:groups=trustyai.opendatahub.io,resources=evalhubs/finalizers,verbs=update -//+kubebuilder:rbac:groups=trustyai.opendatahub.io,resources=evalhubs/proxy,verbs=get;create;update //+kubebuilder:rbac:groups=apps,resources=deployments,verbs=list;watch;get;create;update;patch;delete //+kubebuilder:rbac:groups=apps,resources=deployments/status,verbs=get;update;patch //+kubebuilder:rbac:groups=core,resources=services,verbs=get;list;watch;create;update;patch;delete @@ -52,6 +52,7 @@ type EvalHubReconciler struct { //+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=clusterrolebindings,verbs=get;list;watch;create;update;delete //+kubebuilder:rbac:groups=route.openshift.io,resources=routes,verbs=list;watch;get;create;update;patch;delete //+kubebuilder:rbac:groups="",resources=events,verbs=create;patch;update +//+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=roles,verbs=get;list;watch;create;update;patch;delete //+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=rolebindings,verbs=get;list;watch;create;update;patch;delete // Reconcile is part of the main kubernetes reconciliation loop which aims to @@ -230,22 +231,12 @@ func (r *EvalHubReconciler) handleDeletion(ctx context.Context, instance *evalhu return DoNotRequeue() } -// cleanupClusterRoleBinding deletes the EvalHub proxy ClusterRoleBinding upon instance deletion +// cleanupClusterRoleBinding deletes EvalHub cluster-scoped RBAC resources upon instance deletion. +// Namespace-scoped resources (RoleBindings, ServiceAccounts) are garbage-collected via owner references. func (r *EvalHubReconciler) cleanupClusterRoleBinding(ctx context.Context, instance *evalhubv1alpha1.EvalHub) error { - // Delete proxy ClusterRoleBinding - proxyCRBName := instance.Name + "-" + instance.Namespace + "-proxy-rolebinding" - if err := r.deleteClusterRoleBinding(ctx, proxyCRBName); err != nil { - return err - } - - // Delete jobs RoleBinding - jobsRBName := instance.Name + "-" + instance.Namespace + "-jobs-proxy-rolebinding" - if err := r.deleteRoleBinding(ctx, instance.Namespace, jobsRBName); err != nil { - return err - } - // Cleanup legacy jobs ClusterRoleBinding name (pre-rename) - legacyJobsCRBName := instance.Namespace + "-" + instance.Name + "-jobs-proxy" - if err := r.deleteClusterRoleBinding(ctx, legacyJobsCRBName); err != nil { + // Delete auth reviewer ClusterRoleBinding (cannot be owner-ref'd to a namespaced resource) + authCRBName := generateAuthReviewerClusterRoleBindingName(instance) + if err := r.deleteClusterRoleBinding(ctx, authCRBName); err != nil { return err } @@ -273,23 +264,6 @@ func (r *EvalHubReconciler) deleteClusterRoleBinding(ctx context.Context, crbNam } } -// deleteRoleBinding deletes a specific RoleBinding by name and namespace -func (r *EvalHubReconciler) deleteRoleBinding(ctx context.Context, namespace, rbName string) error { - log := log.FromContext(ctx) - - rb := &rbacv1.RoleBinding{} - log.Info("Deleting RoleBinding", "name", rbName, "namespace", namespace) - - err := r.Get(ctx, types.NamespacedName{Name: rbName, Namespace: namespace}, rb) - if err == nil { - return r.Delete(ctx, rb) - } else if errors.IsNotFound(err) { - log.Info("RoleBinding not found, may have been already deleted", "name", rbName, "namespace", namespace) - return nil - } - return err -} - // updateStatus updates the EvalHub status based on the deployment status func (r *EvalHubReconciler) updateStatus(ctx context.Context, instance *evalhubv1alpha1.EvalHub) error { log := log.FromContext(ctx) diff --git a/controllers/evalhub/label_normalization_test.go b/controllers/evalhub/label_normalization_test.go new file mode 100644 index 000000000..fab5c5a73 --- /dev/null +++ b/controllers/evalhub/label_normalization_test.go @@ -0,0 +1,75 @@ +package evalhub + +import ( + "context" + "strings" + "testing" + + "github.com/stretchr/testify/require" + evalhubv1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/evalhub/v1alpha1" + corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/tools/record" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func TestNormalizeDNS1123LabelValue(t *testing.T) { + out := normalizeDNS1123LabelValue("This_Is.Not DNS1123!!!") + require.LessOrEqual(t, len(out), 63) + require.Regexp(t, dns1123LabelRe, out) + + long := strings.Repeat("a", 80) + "-suffix" + out2 := normalizeDNS1123LabelValue(long) + require.LessOrEqual(t, len(out2), 63) + require.Regexp(t, dns1123LabelRe, out2) + require.Equal(t, out2, normalizeDNS1123LabelValue(long), "normalization should be stable") +} + +func TestAuthReviewerCRB_AppNameLabelIsNormalizedWhenBindingNameTooLong(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, corev1.AddToScheme(scheme)) + require.NoError(t, rbacv1.AddToScheme(scheme)) + require.NoError(t, evalhubv1alpha1.AddToScheme(scheme)) + + ctx := context.Background() + + // Make the generated binding name exceed 63 chars: + // --auth-reviewer-crb + instanceName := strings.Repeat("a", 30) + ns := strings.Repeat("b", 30) + + evalHub := &evalhubv1alpha1.EvalHub{ + ObjectMeta: metav1.ObjectMeta{ + Name: instanceName, + Namespace: ns, + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(evalHub). + Build() + + reconciler := &EvalHubReconciler{ + Client: fakeClient, + Scheme: scheme, + Namespace: ns, + EventRecorder: record.NewFakeRecorder(10), + } + + require.NoError(t, reconciler.createServiceAccount(ctx, evalHub)) + + crbName := generateAuthReviewerClusterRoleBindingName(evalHub) + require.Greater(t, len(crbName), 63, "test must exercise the >63 char case") + + crb := &rbacv1.ClusterRoleBinding{} + require.NoError(t, fakeClient.Get(ctx, types.NamespacedName{Name: crbName}, crb)) + + lbl := crb.Labels["app.kubernetes.io/name"] + require.LessOrEqual(t, len(lbl), 63) + require.Regexp(t, dns1123LabelRe, lbl) + require.Equal(t, generateAuthReviewerClusterRoleBindingAppNameLabelValue(evalHub), lbl) +} diff --git a/controllers/evalhub/proxy_rbac_test.go b/controllers/evalhub/proxy_rbac_test.go index 62d7d764d..cf16f1d1f 100644 --- a/controllers/evalhub/proxy_rbac_test.go +++ b/controllers/evalhub/proxy_rbac_test.go @@ -14,11 +14,11 @@ import ( "sigs.k8s.io/yaml" ) -var _ = Describe("EvalHub Proxy RBAC", func() { +var _ = Describe("EvalHub API RBAC", func() { const ( - testNamespacePrefix = "evalhub-proxy-rbac-test" + testNamespacePrefix = "evalhub-api-rbac-test" operatorNamespacePrefix = "operator-system" - evalHubName = "proxy-rbac-evalhub" + evalHubName = "api-rbac-evalhub" configMapName = "trustyai-service-operator-config" ) @@ -88,10 +88,10 @@ var _ = Describe("EvalHub Proxy RBAC", func() { }) Context("ServiceAccount Management", func() { - It("should generate correct service account name", func() { + It("should generate correct service account name with -api suffix", func() { By("Generating service account name") saName := generateServiceAccountName(evalHub) - Expect(saName).To(Equal(evalHubName + "-proxy")) + Expect(saName).To(Equal(evalHubName + "-api")) }) It("should create service account with correct configuration", func() { @@ -99,21 +99,21 @@ var _ = Describe("EvalHub Proxy RBAC", func() { err := reconciler.createServiceAccount(ctx, evalHub) Expect(err).NotTo(HaveOccurred()) - By("Verifying service account exists") + By("Verifying service account exists with -api suffix") serviceAccount := &corev1.ServiceAccount{} err = k8sClient.Get(ctx, types.NamespacedName{ - Name: evalHubName + "-proxy", + Name: evalHubName + "-api", Namespace: testNamespace, }, serviceAccount) Expect(err).NotTo(HaveOccurred()) By("Checking service account specifications") - Expect(serviceAccount.Name).To(Equal(evalHubName + "-proxy")) + Expect(serviceAccount.Name).To(Equal(evalHubName + "-api")) Expect(serviceAccount.Namespace).To(Equal(testNamespace)) By("Checking labels") Expect(serviceAccount.Labels["app"]).To(Equal("eval-hub")) - Expect(serviceAccount.Labels["app.kubernetes.io/name"]).To(Equal(evalHubName + "-proxy")) + Expect(serviceAccount.Labels["app.kubernetes.io/name"]).To(Equal(evalHubName + "-api")) Expect(serviceAccount.Labels["app.kubernetes.io/instance"]).To(Equal(evalHub.Name)) Expect(serviceAccount.Labels["app.kubernetes.io/part-of"]).To(Equal("eval-hub")) @@ -123,34 +123,108 @@ var _ = Describe("EvalHub Proxy RBAC", func() { Expect(serviceAccount.OwnerReferences[0].Kind).To(Equal("EvalHub")) }) - It("should create cluster role binding with correct configuration", func() { - By("Creating service account (which also creates cluster role binding)") + It("should create auth reviewer ClusterRoleBinding (not full proxy role)", func() { + By("Creating service account (which also creates auth reviewer ClusterRoleBinding)") err := reconciler.createServiceAccount(ctx, evalHub) Expect(err).NotTo(HaveOccurred()) - By("Verifying cluster role binding exists") + By("Verifying auth reviewer ClusterRoleBinding exists") clusterRoleBinding := &rbacv1.ClusterRoleBinding{} - bindingName := fmt.Sprintf("%s-%s-proxy-rolebinding", evalHub.Name, evalHub.Namespace) + bindingName := fmt.Sprintf("%s-%s-auth-reviewer-crb", evalHub.Name, evalHub.Namespace) err = k8sClient.Get(ctx, types.NamespacedName{ Name: bindingName, }, clusterRoleBinding) Expect(err).NotTo(HaveOccurred()) - By("Checking cluster role binding specifications") + By("Checking ClusterRoleBinding specifications") Expect(clusterRoleBinding.Name).To(Equal(bindingName)) - By("Checking subjects") + By("Checking subjects use -api SA") Expect(clusterRoleBinding.Subjects).To(HaveLen(1)) Expect(clusterRoleBinding.Subjects[0].Kind).To(Equal("ServiceAccount")) - Expect(clusterRoleBinding.Subjects[0].Name).To(Equal(evalHubName + "-proxy")) + Expect(clusterRoleBinding.Subjects[0].Name).To(Equal(evalHubName + "-api")) Expect(clusterRoleBinding.Subjects[0].Namespace).To(Equal(testNamespace)) - By("Checking role reference") + By("Checking role reference points to auth-reviewer (not proxy-role)") Expect(clusterRoleBinding.RoleRef.Kind).To(Equal("ClusterRole")) - Expect(clusterRoleBinding.RoleRef.Name).To(Equal("trustyai-service-operator-evalhub-proxy-role")) + Expect(clusterRoleBinding.RoleRef.Name).To(Equal(authReviewerClusterRoleName)) Expect(clusterRoleBinding.RoleRef.APIGroup).To(Equal("rbac.authorization.k8s.io")) }) + It("should create per-instance API access Role with resourceNames", func() { + By("Creating service account") + err := reconciler.createServiceAccount(ctx, evalHub) + Expect(err).NotTo(HaveOccurred()) + + By("Verifying per-instance Role exists") + role := &rbacv1.Role{} + roleName := generateAPIAccessRoleName(evalHub) + err = k8sClient.Get(ctx, types.NamespacedName{ + Name: roleName, + Namespace: testNamespace, + }, role) + Expect(err).NotTo(HaveOccurred()) + + By("Checking Role has resourceNames set to instance name") + Expect(role.Rules).To(HaveLen(2)) + Expect(role.Rules[0].ResourceNames).To(Equal([]string{evalHubName})) + Expect(role.Rules[1].ResourceNames).To(Equal([]string{evalHubName})) + }) + + It("should create namespace-scoped API access RoleBinding referencing per-instance Role", func() { + By("Creating service account") + err := reconciler.createServiceAccount(ctx, evalHub) + Expect(err).NotTo(HaveOccurred()) + + By("Verifying API access RoleBinding exists in namespace") + roleBinding := &rbacv1.RoleBinding{} + rbName := evalHubName + "-api-access-rb" + err = k8sClient.Get(ctx, types.NamespacedName{ + Name: rbName, + Namespace: testNamespace, + }, roleBinding) + Expect(err).NotTo(HaveOccurred()) + + By("Checking RoleBinding is namespace-scoped") + Expect(roleBinding.Namespace).To(Equal(testNamespace)) + + By("Checking subjects use -api SA") + Expect(roleBinding.Subjects).To(HaveLen(1)) + Expect(roleBinding.Subjects[0].Kind).To(Equal("ServiceAccount")) + Expect(roleBinding.Subjects[0].Name).To(Equal(evalHubName + "-api")) + + By("Checking role reference points to per-instance Role (not ClusterRole)") + Expect(roleBinding.RoleRef.Kind).To(Equal("Role")) + Expect(roleBinding.RoleRef.Name).To(Equal(generateAPIAccessRoleName(evalHub))) + }) + + It("should create namespace-scoped jobs API access RoleBinding referencing per-instance Role", func() { + By("Creating service account") + err := reconciler.createServiceAccount(ctx, evalHub) + Expect(err).NotTo(HaveOccurred()) + + By("Verifying jobs API access RoleBinding exists in namespace") + roleBinding := &rbacv1.RoleBinding{} + rbName := evalHubName + "-jobs-api-access-rb" + err = k8sClient.Get(ctx, types.NamespacedName{ + Name: rbName, + Namespace: testNamespace, + }, roleBinding) + Expect(err).NotTo(HaveOccurred()) + + By("Checking RoleBinding is namespace-scoped") + Expect(roleBinding.Namespace).To(Equal(testNamespace)) + + By("Checking subjects use -jobs SA") + Expect(roleBinding.Subjects).To(HaveLen(1)) + Expect(roleBinding.Subjects[0].Kind).To(Equal("ServiceAccount")) + Expect(roleBinding.Subjects[0].Name).To(Equal(evalHubName + "-jobs")) + + By("Checking role reference points to per-instance Role (not ClusterRole)") + Expect(roleBinding.RoleRef.Kind).To(Equal("Role")) + Expect(roleBinding.RoleRef.Name).To(Equal(generateJobsAPIAccessRoleName(evalHub))) + }) + It("should handle existing service account gracefully", func() { By("Creating service account initially") err := reconciler.createServiceAccount(ctx, evalHub) @@ -163,11 +237,62 @@ var _ = Describe("EvalHub Proxy RBAC", func() { By("Verifying only one service account exists") serviceAccount := &corev1.ServiceAccount{} err = k8sClient.Get(ctx, types.NamespacedName{ - Name: evalHubName + "-proxy", + Name: evalHubName + "-api", Namespace: testNamespace, }, serviceAccount) Expect(err).NotTo(HaveOccurred()) - Expect(serviceAccount.Name).To(Equal(evalHubName + "-proxy")) + Expect(serviceAccount.Name).To(Equal(evalHubName + "-api")) + }) + }) + + Context("Split Resource Manager", func() { + It("should create two separate RoleBindings for resource management", func() { + By("Creating service account (which creates all RoleBindings)") + err := reconciler.createServiceAccount(ctx, evalHub) + Expect(err).NotTo(HaveOccurred()) + + By("Verifying jobs-writer RoleBinding exists") + jwRB := &rbacv1.RoleBinding{} + err = k8sClient.Get(ctx, types.NamespacedName{ + Name: evalHubName + "-jobs-writer-rb", + Namespace: testNamespace, + }, jwRB) + Expect(err).NotTo(HaveOccurred()) + Expect(jwRB.RoleRef.Kind).To(Equal("ClusterRole")) + Expect(jwRB.RoleRef.Name).To(Equal(jobsWriterClusterRoleName)) + Expect(jwRB.Subjects).To(HaveLen(1)) + Expect(jwRB.Subjects[0].Name).To(Equal(evalHubName + "-api")) + + By("Verifying job-config RoleBinding exists") + jcRB := &rbacv1.RoleBinding{} + err = k8sClient.Get(ctx, types.NamespacedName{ + Name: evalHubName + "-job-config-rb", + Namespace: testNamespace, + }, jcRB) + Expect(err).NotTo(HaveOccurred()) + Expect(jcRB.RoleRef.Kind).To(Equal("ClusterRole")) + Expect(jcRB.RoleRef.Name).To(Equal(jobConfigClusterRoleName)) + Expect(jcRB.Subjects).To(HaveLen(1)) + Expect(jcRB.Subjects[0].Name).To(Equal(evalHubName + "-api")) + + }) + + It("should not bind jobs SA to resource-manager roles", func() { + By("Creating service account") + err := reconciler.createServiceAccount(ctx, evalHub) + Expect(err).NotTo(HaveOccurred()) + + By("Checking jobs-writer binding has only API SA") + jwRB := &rbacv1.RoleBinding{} + err = k8sClient.Get(ctx, types.NamespacedName{ + Name: evalHubName + "-jobs-writer-rb", + Namespace: testNamespace, + }, jwRB) + Expect(err).NotTo(HaveOccurred()) + for _, subject := range jwRB.Subjects { + Expect(subject.Name).NotTo(Equal(evalHubName+"-jobs"), + "Jobs SA should not be bound to resource-manager roles") + } }) }) @@ -303,6 +428,7 @@ var _ = Describe("EvalHub Proxy RBAC", func() { Expect(resourceAttrs["namespace"]).To(Equal(testNamespace)) Expect(resourceAttrs["apiGroup"]).To(Equal("trustyai.opendatahub.io")) Expect(resourceAttrs["resource"]).To(Equal("evalhubs")) + Expect(resourceAttrs["name"]).To(Equal(evalHubName)) Expect(resourceAttrs["resourceName"]).To(Equal(evalHubName)) Expect(resourceAttrs["subresource"]).To(Equal("proxy")) @@ -387,6 +513,7 @@ var _ = Describe("EvalHub Proxy RBAC", func() { authorization := proxyConfig["authorization"].(map[string]interface{}) resourceAttrs := authorization["resourceAttributes"].(map[string]interface{}) Expect(resourceAttrs["namespace"]).To(Equal(evalHub.Namespace)) + Expect(resourceAttrs["name"]).To(Equal(evalHub.Name)) Expect(resourceAttrs["resourceName"]).To(Equal(evalHub.Name)) }) }) @@ -436,4 +563,30 @@ var _ = Describe("EvalHub Proxy RBAC", func() { Expect(configMap.OwnerReferences).To(BeEmpty()) }) }) + + Context("RBAC Namespace Scoping", func() { + It("should not create cross-namespace ClusterRoleBinding for API access", func() { + By("Creating service account and RBAC resources") + err := reconciler.createServiceAccount(ctx, evalHub) + Expect(err).NotTo(HaveOccurred()) + + By("Verifying no legacy proxy ClusterRoleBinding exists") + legacyCRB := &rbacv1.ClusterRoleBinding{} + legacyBindingName := fmt.Sprintf("%s-%s-proxy-rolebinding", evalHub.Name, evalHub.Namespace) + err = k8sClient.Get(ctx, types.NamespacedName{ + Name: legacyBindingName, + }, legacyCRB) + Expect(err).To(HaveOccurred(), "Legacy proxy ClusterRoleBinding should not exist") + + By("Verifying API access is via namespace-scoped RoleBinding referencing per-instance Role") + apiRB := &rbacv1.RoleBinding{} + err = k8sClient.Get(ctx, types.NamespacedName{ + Name: evalHubName + "-api-access-rb", + Namespace: testNamespace, + }, apiRB) + Expect(err).NotTo(HaveOccurred()) + Expect(apiRB.RoleRef.Kind).To(Equal("Role")) + Expect(apiRB.RoleRef.Name).To(Equal(generateAPIAccessRoleName(evalHub))) + }) + }) }) diff --git a/controllers/evalhub/rbac_manifests_test.go b/controllers/evalhub/rbac_manifests_test.go new file mode 100644 index 000000000..a47870a2f --- /dev/null +++ b/controllers/evalhub/rbac_manifests_test.go @@ -0,0 +1,66 @@ +package evalhub + +import ( + "os" + "path/filepath" + "runtime" + "sort" + "testing" + + rbacv1 "k8s.io/api/rbac/v1" + "sigs.k8s.io/yaml" +) + +func TestEvalHubJobConfigClusterRoleVerbsAreMinimalAndSufficient(t *testing.T) { + // EvalHub sets ConfigMap ownerReferences after creating the Job: + // it does a Get+Update on the ConfigMap (see eval-hub/internal/runtimes/k8s/k8s_helper.go:SetConfigMapOwner). + // Therefore the job-config ClusterRole must include: create,get,update,delete. + _, thisFile, _, ok := runtime.Caller(0) + if !ok { + t.Fatal("runtime.Caller failed") + } + + // This test file lives at trustyai-service-operator/controllers/evalhub/. + // The manifest lives under trustyai-service-operator/config/rbac/evalhub/. + moduleRoot := filepath.Clean(filepath.Join(filepath.Dir(thisFile), "..", "..")) + manifestPath := filepath.Join(moduleRoot, "config", "rbac", "evalhub", "evalhub_job_config_role.yaml") + + raw, err := os.ReadFile(manifestPath) + if err != nil { + t.Fatalf("read %s: %v", manifestPath, err) + } + + var cr rbacv1.ClusterRole + if err := yaml.Unmarshal(raw, &cr); err != nil { + t.Fatalf("unmarshal %s: %v", manifestPath, err) + } + + var gotVerbs []string + for _, rule := range cr.Rules { + for _, res := range rule.Resources { + if res == "configmaps" { + gotVerbs = append([]string(nil), rule.Verbs...) + break + } + } + if len(gotVerbs) > 0 { + break + } + } + if len(gotVerbs) == 0 { + t.Fatalf("expected %s to contain a policy rule for resource 'configmaps'", manifestPath) + } + + wantVerbs := []string{"create", "delete", "get", "update"} + sort.Strings(gotVerbs) + sort.Strings(wantVerbs) + + if len(gotVerbs) != len(wantVerbs) { + t.Fatalf("unexpected verbs for configmaps: got=%v want=%v", gotVerbs, wantVerbs) + } + for i := range wantVerbs { + if gotVerbs[i] != wantVerbs[i] { + t.Fatalf("unexpected verbs for configmaps: got=%v want=%v", gotVerbs, wantVerbs) + } + } +} diff --git a/controllers/evalhub/service_accounts.go b/controllers/evalhub/service_accounts.go index 344275481..18cc58025 100644 --- a/controllers/evalhub/service_accounts.go +++ b/controllers/evalhub/service_accounts.go @@ -2,7 +2,11 @@ package evalhub import ( "context" + "crypto/sha256" + "encoding/hex" + "regexp" "sort" + "strings" evalhubv1alpha1 "github.com/trustyai-explainability/trustyai-service-operator/api/evalhub/v1alpha1" "github.com/trustyai-explainability/trustyai-service-operator/controllers/constants" @@ -16,8 +20,106 @@ import ( ctrl "sigs.k8s.io/controller-runtime" ) +var dns1123LabelRe = regexp.MustCompile(`^[a-z0-9]([a-z0-9-]*[a-z0-9])?$`) + +// normalizeDNS1123LabelValue converts s into a DNS-1123 compatible string that is <= 63 chars. +// It preserves a human-readable prefix and appends a short stable hash to avoid collisions. +// +// This is intentionally stricter than the Kubernetes label value regex to ensure cross-tool +// compatibility (e.g. components that validate against DNS-1123). +func normalizeDNS1123LabelValue(s string) string { + const maxLen = 63 + const hashLen = 10 // 40 bits of hash in hex; low collision risk for our use. + + raw := strings.ToLower(strings.TrimSpace(s)) + if raw == "" { + return "x" + } + + // Replace any invalid character with '-' and collapse runs of '-'. + var b strings.Builder + b.Grow(len(raw)) + lastDash := false + for _, r := range raw { + isAllowed := (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '-' + if !isAllowed { + if !lastDash { + b.WriteByte('-') + lastDash = true + } + continue + } + if r == '-' { + if lastDash { + continue + } + lastDash = true + b.WriteByte('-') + continue + } + lastDash = false + b.WriteRune(r) + } + clean := strings.Trim(b.String(), "-") + if clean == "" { + clean = "x" + } + + // If already valid and within limit, return as-is. + if len(clean) <= maxLen && dns1123LabelRe.MatchString(clean) { + return clean + } + + sum := sha256.Sum256([]byte(s)) + h := hex.EncodeToString(sum[:])[:hashLen] + + // Keep as much prefix as possible while reserving "-". + prefixMax := maxLen - 1 - hashLen + prefix := clean + if len(prefix) > prefixMax { + prefix = prefix[:prefixMax] + prefix = strings.Trim(prefix, "-") + if prefix == "" { + prefix = "x" + } + } + + out := prefix + "-" + h + // Defensive: ensure output is valid. + out = strings.Trim(out, "-") + if len(out) > maxLen { + out = out[:maxLen] + out = strings.Trim(out, "-") + } + if out == "" || !dns1123LabelRe.MatchString(out) { + // Last resort: just the hash with a leading alpha prefix. + out = "x-" + h + } + return out +} + func generateServiceAccountName(instance *evalhubv1alpha1.EvalHub) string { - return instance.Name + "-proxy" + return instance.Name + "-api" +} + +// generateAPIAccessRoleName returns the name for the per-instance API access Role. +func generateAPIAccessRoleName(instance *evalhubv1alpha1.EvalHub) string { + return instance.Name + "-api-access-role" +} + +// generateJobsAPIAccessRoleName returns the name for the per-instance jobs API access Role. +func generateJobsAPIAccessRoleName(instance *evalhubv1alpha1.EvalHub) string { + return instance.Name + "-jobs-api-access-role" +} + +func generateAuthReviewerClusterRoleBindingName(instance *evalhubv1alpha1.EvalHub) string { + return instance.Name + "-" + instance.Namespace + "-auth-reviewer-crb" +} + +// generateAuthReviewerClusterRoleBindingAppNameLabelValue returns a deterministic, DNS-1123 compatible +// label value (<=63 chars) derived from the full auth reviewer ClusterRoleBinding name. +func generateAuthReviewerClusterRoleBindingAppNameLabelValue(instance *evalhubv1alpha1.EvalHub) string { + return normalizeDNS1123LabelValue(generateAuthReviewerClusterRoleBindingName(instance)) } // createServiceAccount creates a service account for this instance's kube-rbac-proxy @@ -30,7 +132,7 @@ func (r *EvalHubReconciler) createServiceAccount(ctx context.Context, instance * Namespace: instance.Namespace, Labels: map[string]string{ "app": "eval-hub", - "app.kubernetes.io/name": serviceAccountName, + "app.kubernetes.io/name": normalizeDNS1123LabelValue(serviceAccountName), "app.kubernetes.io/instance": instance.Name, "app.kubernetes.io/part-of": "eval-hub", "app.kubernetes.io/version": constants.Version, @@ -56,50 +158,201 @@ func (r *EvalHubReconciler) createServiceAccount(ctx context.Context, instance * return err } - err = r.createClusterRoleBinding(ctx, instance, serviceAccountName) + // Create ClusterRoleBinding for kube-rbac-proxy auth (tokenreviews/subjectaccessreviews only) + err = r.createAuthReviewerClusterRoleBinding(ctx, instance, serviceAccountName) if err != nil { return err } - // Create RoleBinding for proxy ServiceAccount to the pre-created ClusterRole - // This allows the evalhub service to create ConfigMaps, Jobs, and proxy to services - err = r.createResourceManagementRoleBinding(ctx, instance, serviceAccountName) + // Create per-instance Roles before RoleBindings + err = r.createAPIAccessRole(ctx, instance) + if err != nil { + return err + } + + err = r.createJobsAPIAccessRole(ctx, instance) + if err != nil { + return err + } + + // Create namespace-scoped RoleBinding for EvalHub API access (evalhubs/proxy) + err = r.createAPIAccessRoleBinding(ctx, instance, serviceAccountName) + if err != nil { + return err + } + + // Create RoleBindings for split resource-manager roles (API SA only) + err = r.createJobsWriterRoleBinding(ctx, instance, serviceAccountName) + if err != nil { + return err + } + + err = r.createJobConfigRoleBinding(ctx, instance, serviceAccountName) if err != nil { return err } // Create RoleBinding for jobs ServiceAccount to access evalhubs/proxy in this namespace jobsServiceAccountName := generateJobsServiceAccountName(instance) - err = r.createJobsProxyRoleBinding(ctx, instance, jobsServiceAccountName) + err = r.createJobsAPIAccessRoleBinding(ctx, instance, jobsServiceAccountName) if err != nil { return err } - // Create RoleBinding for jobs ServiceAccount to the pre-created ClusterRole - // This allows jobs to create ConfigMaps and Jobs in this namespace - err = r.createJobsResourceManagementRoleBinding(ctx, instance, jobsServiceAccountName) + // Create MLFlow access RoleBindings for both ServiceAccounts. + // MLFlow's kubernetes-auth plugin validates tokens via SubjectAccessReview against + // the workspace namespace. The custom "evalhub-mlflow-access" ClusterRole provides + // the required mlflow.kubeflow.org permissions for both the api and jobs SAs. + err = r.createMLFlowAccessRoleBinding(ctx, instance, serviceAccountName, "api", mlflowAccessClusterRoleName) if err != nil { return err } + err = r.createMLFlowAccessRoleBinding(ctx, instance, jobsServiceAccountName, "jobs", mlflowJobsAccessClusterRoleName) + if err != nil { + return err + } + + return nil +} + +// authReviewerClusterRoleName is the ClusterRole for kube-rbac-proxy auth checks only. +// It contains only tokenreviews/create and subjectaccessreviews/create permissions. +const authReviewerClusterRoleName = "trustyai-service-operator-evalhub-auth-reviewer-role" + +// Split resource-manager ClusterRole names. +// These replace the monolithic evalhub-resource-manager with function-specific roles. +const ( + jobsWriterClusterRoleName = "trustyai-service-operator-evalhub-jobs-writer" + jobConfigClusterRoleName = "trustyai-service-operator-evalhub-job-config" +) + +// MLFlow access uses custom ClusterRoles scoped to the "mlflow.kubeflow.org" API group. +// MLFlow's kubernetes-workspace-provider checks permissions via SelfSubjectAccessReview +// against this group (not core Kubernetes resources). The ClusterRoles are pre-created +// at operator installation time (config/rbac/evalhub_mlflow_access_role.yaml and +// config/rbac/evalhub_mlflow_jobs_role.yaml). +const mlflowAccessClusterRoleName = "trustyai-service-operator-evalhub-mlflow-access" + +// mlflowJobsAccessClusterRoleName is a restricted MLflow ClusterRole for job pods. +// Jobs only need create, get, list -- not update or delete. +const mlflowJobsAccessClusterRoleName = "trustyai-service-operator-evalhub-mlflow-jobs-access" + +// createAPIAccessRole creates a per-instance namespaced Role with resourceNames +// scoped to this specific EvalHub instance. This ensures the SA can only access +// its own instance's evalhubs/proxy subresource. +func (r *EvalHubReconciler) createAPIAccessRole(ctx context.Context, instance *evalhubv1alpha1.EvalHub) error { + log := log.FromContext(ctx) + + roleName := generateAPIAccessRoleName(instance) + role := &rbacv1.Role{ + ObjectMeta: metav1.ObjectMeta{ + Name: roleName, + Namespace: instance.Namespace, + Labels: map[string]string{ + "app": "eval-hub", + "app.kubernetes.io/name": roleName, + "app.kubernetes.io/instance": instance.Name, + "app.kubernetes.io/part-of": "eval-hub", + "app.kubernetes.io/version": constants.Version, + }, + }, + Rules: []rbacv1.PolicyRule{ + { + APIGroups: []string{"trustyai.opendatahub.io"}, + Resources: []string{"evalhubs"}, + ResourceNames: []string{instance.Name}, + Verbs: []string{"get"}, + }, + { + APIGroups: []string{"trustyai.opendatahub.io"}, + Resources: []string{"evalhubs/proxy"}, + ResourceNames: []string{instance.Name}, + Verbs: []string{"get", "create"}, + }, + }, + } + + if err := ctrl.SetControllerReference(instance, role, r.Scheme); err != nil { + return err + } + + found := &rbacv1.Role{} + err := r.Get(ctx, types.NamespacedName{Name: role.Name, Namespace: role.Namespace}, found) + if err != nil && errors.IsNotFound(err) { + log.Info("Creating API access Role", "Namespace", role.Namespace, "Name", role.Name) + return r.Create(ctx, role) + } else if err != nil { + return err + } + + // Role exists, check if rules need updating + if !equalPolicyRules(found.Rules, role.Rules) { + found.Rules = role.Rules + log.Info("Updating API access Role rules", "Name", role.Name) + return r.Update(ctx, found) + } return nil } -// Pre-created ClusterRole name for EvalHub resource management -// This ClusterRole is installed with the operator and contains permissions for: -// - ConfigMaps (for job specs) -// - Jobs (for evaluation jobs) -// - services/proxy (for job callbacks) -const resourceManagerClusterRoleName = "trustyai-service-operator-evalhub-resource-manager" - -// createResourceManagementRoleBinding creates a RoleBinding for the proxy ServiceAccount -// to the pre-created ClusterRole for resource management. -// Using a pre-created ClusterRole avoids the need for the operator to have broad -// permissions that would be required to dynamically create Roles with these permissions. -func (r *EvalHubReconciler) createResourceManagementRoleBinding(ctx context.Context, instance *evalhubv1alpha1.EvalHub, serviceAccountName string) error { +// createJobsAPIAccessRole creates a per-instance namespaced Role for the jobs SA +// with resourceNames scoped to this specific EvalHub instance. +func (r *EvalHubReconciler) createJobsAPIAccessRole(ctx context.Context, instance *evalhubv1alpha1.EvalHub) error { log := log.FromContext(ctx) - roleBindingName := instance.Name + "-resource-manager" + roleName := generateJobsAPIAccessRoleName(instance) + role := &rbacv1.Role{ + ObjectMeta: metav1.ObjectMeta{ + Name: roleName, + Namespace: instance.Namespace, + Labels: map[string]string{ + "app": "eval-hub", + "app.kubernetes.io/name": roleName, + "app.kubernetes.io/instance": instance.Name, + "app.kubernetes.io/part-of": "eval-hub", + "app.kubernetes.io/version": constants.Version, + }, + }, + Rules: []rbacv1.PolicyRule{ + { + APIGroups: []string{"trustyai.opendatahub.io"}, + Resources: []string{"evalhubs/proxy"}, + ResourceNames: []string{instance.Name}, + Verbs: []string{"get", "create"}, + }, + }, + } + + if err := ctrl.SetControllerReference(instance, role, r.Scheme); err != nil { + return err + } + + found := &rbacv1.Role{} + err := r.Get(ctx, types.NamespacedName{Name: role.Name, Namespace: role.Namespace}, found) + if err != nil && errors.IsNotFound(err) { + log.Info("Creating jobs API access Role", "Namespace", role.Namespace, "Name", role.Name) + return r.Create(ctx, role) + } else if err != nil { + return err + } + + if !equalPolicyRules(found.Rules, role.Rules) { + found.Rules = role.Rules + log.Info("Updating jobs API access Role rules", "Name", role.Name) + return r.Update(ctx, found) + } + + return nil +} + +// createMLFlowAccessRoleBinding creates a RoleBinding for a ServiceAccount to +// the specified MLflow ClusterRole in the instance namespace. This allows the +// ServiceAccount to pass MLFlow's kubernetes-auth SubjectAccessReview checks +// against the mlflow.kubeflow.org API group in the workspace namespace. +func (r *EvalHubReconciler) createMLFlowAccessRoleBinding(ctx context.Context, instance *evalhubv1alpha1.EvalHub, serviceAccountName string, suffix string, clusterRoleName string) error { + log := log.FromContext(ctx) + + roleBindingName := instance.Name + "-mlflow-" + suffix + "-rb" roleBinding := &rbacv1.RoleBinding{ ObjectMeta: metav1.ObjectMeta{ Name: roleBindingName, @@ -121,7 +374,7 @@ func (r *EvalHubReconciler) createResourceManagementRoleBinding(ctx context.Cont }, RoleRef: rbacv1.RoleRef{ Kind: "ClusterRole", - Name: resourceManagerClusterRoleName, + Name: clusterRoleName, APIGroup: rbacv1.GroupName, }, } @@ -135,7 +388,7 @@ func (r *EvalHubReconciler) createResourceManagementRoleBinding(ctx context.Cont found := &rbacv1.RoleBinding{} err := r.Get(ctx, types.NamespacedName{Name: roleBinding.Name, Namespace: roleBinding.Namespace}, found) if err != nil && errors.IsNotFound(err) { - log.Info("Creating resource management RoleBinding", "Namespace", roleBinding.Namespace, "Name", roleBinding.Name) + log.Info("Creating MLFlow access RoleBinding", "Namespace", roleBinding.Namespace, "Name", roleBinding.Name) return r.Create(ctx, roleBinding) } else if err != nil { return err @@ -147,21 +400,15 @@ func (r *EvalHubReconciler) createResourceManagementRoleBinding(ctx context.Cont if !subjectsEqual || !roleRefEqual { if roleRefEqual && !subjectsEqual { - // Only subjects differ, we can update them found.Subjects = roleBinding.Subjects - log.Info("Updating resource management RoleBinding subjects", "Name", roleBinding.Name) + log.Info("Updating MLFlow access RoleBinding subjects", "Name", roleBinding.Name) return r.Update(ctx, found) } else if !roleRefEqual { - // RoleRef differs, we need to delete and recreate as RoleRef is immutable - log.Info("RoleRef differs, deleting and recreating resource management RoleBinding", "Name", roleBinding.Name) - - // Delete existing RoleBinding + log.Info("RoleRef differs, deleting and recreating MLFlow access RoleBinding", "Name", roleBinding.Name) if err := r.Delete(ctx, found); err != nil { return err } - - // Create new RoleBinding with desired spec - log.Info("Creating new resource management RoleBinding", "Name", roleBinding.Name) + log.Info("Creating new MLFlow access RoleBinding", "Name", roleBinding.Name) return r.Create(ctx, roleBinding) } } @@ -169,19 +416,19 @@ func (r *EvalHubReconciler) createResourceManagementRoleBinding(ctx context.Cont return nil } -// createJobsResourceManagementRoleBinding creates a RoleBinding for the jobs ServiceAccount -// to the pre-created ClusterRole for resource management. -func (r *EvalHubReconciler) createJobsResourceManagementRoleBinding(ctx context.Context, instance *evalhubv1alpha1.EvalHub, serviceAccountName string) error { +// createAuthReviewerClusterRoleBinding creates a ClusterRoleBinding for kube-rbac-proxy +// auth checks (tokenreviews and subjectaccessreviews only). This is the only +// cluster-scoped binding needed for the EvalHub API ServiceAccount. +func (r *EvalHubReconciler) createAuthReviewerClusterRoleBinding(ctx context.Context, instance *evalhubv1alpha1.EvalHub, serviceAccountName string) error { log := log.FromContext(ctx) - roleBindingName := instance.Name + "-resource-manager-jobs" - roleBinding := &rbacv1.RoleBinding{ + clusterRoleBindingName := generateAuthReviewerClusterRoleBindingName(instance) + clusterRoleBinding := &rbacv1.ClusterRoleBinding{ ObjectMeta: metav1.ObjectMeta{ - Name: roleBindingName, - Namespace: instance.Namespace, + Name: clusterRoleBindingName, Labels: map[string]string{ "app": "eval-hub", - "app.kubernetes.io/name": roleBindingName, + "app.kubernetes.io/name": generateAuthReviewerClusterRoleBindingAppNameLabelValue(instance), "app.kubernetes.io/instance": instance.Name, "app.kubernetes.io/part-of": "eval-hub", "app.kubernetes.io/version": constants.Version, @@ -196,67 +443,103 @@ func (r *EvalHubReconciler) createJobsResourceManagementRoleBinding(ctx context. }, RoleRef: rbacv1.RoleRef{ Kind: "ClusterRole", - Name: resourceManagerClusterRoleName, + Name: authReviewerClusterRoleName, APIGroup: rbacv1.GroupName, }, } - // Set instance as the owner and controller - if err := ctrl.SetControllerReference(instance, roleBinding, r.Scheme); err != nil { - return err - } - - // Check if this RoleBinding already exists - found := &rbacv1.RoleBinding{} - err := r.Get(ctx, types.NamespacedName{Name: roleBinding.Name, Namespace: roleBinding.Namespace}, found) + // Check if ClusterRoleBinding already exists + found := &rbacv1.ClusterRoleBinding{} + err := r.Get(ctx, types.NamespacedName{Name: clusterRoleBindingName}, found) if err != nil && errors.IsNotFound(err) { - log.Info("Creating jobs resource management RoleBinding", "Namespace", roleBinding.Namespace, "Name", roleBinding.Name) - return r.Create(ctx, roleBinding) + // Note: We don't set owner references because ClusterRoleBindings cannot have namespace-scoped owners + log.Info("Creating auth reviewer ClusterRoleBinding", "name", clusterRoleBindingName) + return r.Create(ctx, clusterRoleBinding) } else if err != nil { return err } - // RoleBinding exists, check if it needs updating - subjectsEqual := equalRoleBindingSubjects(found.Subjects, roleBinding.Subjects) - roleRefEqual := equalRoleBindingRoleRef(found.RoleRef, roleBinding.RoleRef) + // ClusterRoleBinding already exists, check if it needs updating + subjectsEqual := equalSubjects(found, clusterRoleBinding) + roleRefEqual := equalRoleRef(found, clusterRoleBinding) if !subjectsEqual || !roleRefEqual { if roleRefEqual && !subjectsEqual { - // Only subjects differ, we can update them - found.Subjects = roleBinding.Subjects - log.Info("Updating jobs resource management RoleBinding subjects", "Name", roleBinding.Name) + found.Subjects = clusterRoleBinding.Subjects + log.Info("Updating auth reviewer ClusterRoleBinding subjects", "name", clusterRoleBindingName) return r.Update(ctx, found) } else if !roleRefEqual { - // RoleRef differs, we need to delete and recreate as RoleRef is immutable - log.Info("RoleRef differs, deleting and recreating jobs resource management RoleBinding", "Name", roleBinding.Name) - - // Delete existing RoleBinding + log.Info("RoleRef differs, deleting and recreating auth reviewer ClusterRoleBinding", "name", clusterRoleBindingName) if err := r.Delete(ctx, found); err != nil { return err } - - // Create new RoleBinding with desired spec - log.Info("Creating new jobs resource management RoleBinding", "Name", roleBinding.Name) - return r.Create(ctx, roleBinding) + log.Info("Creating new auth reviewer ClusterRoleBinding", "name", clusterRoleBindingName) + return r.Create(ctx, clusterRoleBinding) } } return nil } -// createJobsProxyRoleBinding creates a RoleBinding for jobs ServiceAccount to access evalhubs/proxy -// in the EvalHub namespace, preventing cross-namespace access. -func (r *EvalHubReconciler) createJobsProxyRoleBinding(ctx context.Context, instance *evalhubv1alpha1.EvalHub, serviceAccountName string) error { +// createAPIAccessRoleBinding creates a namespace-scoped RoleBinding for the API +// ServiceAccount to the per-instance Role for evalhubs/proxy access. +func (r *EvalHubReconciler) createAPIAccessRoleBinding(ctx context.Context, instance *evalhubv1alpha1.EvalHub, serviceAccountName string) error { + roleBindingName := instance.Name + "-api-access-rb" + roleName := generateAPIAccessRoleName(instance) + + return r.createGenericRoleBinding(ctx, instance, roleBindingName, serviceAccountName, rbacv1.RoleRef{ + Kind: "Role", + Name: roleName, + APIGroup: rbacv1.GroupName, + }) +} + +// createJobsAPIAccessRoleBinding creates a namespace-scoped RoleBinding for the jobs +// ServiceAccount to the per-instance Role for evalhubs/proxy access. +func (r *EvalHubReconciler) createJobsAPIAccessRoleBinding(ctx context.Context, instance *evalhubv1alpha1.EvalHub, serviceAccountName string) error { + roleBindingName := instance.Name + "-jobs-api-access-rb" + roleName := generateJobsAPIAccessRoleName(instance) + + return r.createGenericRoleBinding(ctx, instance, roleBindingName, serviceAccountName, rbacv1.RoleRef{ + Kind: "Role", + Name: roleName, + APIGroup: rbacv1.GroupName, + }) +} + +// createJobsWriterRoleBinding creates a RoleBinding for the API SA to the +// jobs-writer ClusterRole (batch/jobs create,delete). +func (r *EvalHubReconciler) createJobsWriterRoleBinding(ctx context.Context, instance *evalhubv1alpha1.EvalHub, serviceAccountName string) error { + return r.createGenericRoleBinding(ctx, instance, instance.Name+"-jobs-writer-rb", serviceAccountName, rbacv1.RoleRef{ + Kind: "ClusterRole", + Name: jobsWriterClusterRoleName, + APIGroup: rbacv1.GroupName, + }) +} + +// createJobConfigRoleBinding creates a RoleBinding for the API SA to the +// job-config ClusterRole (configmaps create,get,list). +func (r *EvalHubReconciler) createJobConfigRoleBinding(ctx context.Context, instance *evalhubv1alpha1.EvalHub, serviceAccountName string) error { + return r.createGenericRoleBinding(ctx, instance, instance.Name+"-job-config-rb", serviceAccountName, rbacv1.RoleRef{ + Kind: "ClusterRole", + Name: jobConfigClusterRoleName, + APIGroup: rbacv1.GroupName, + }) +} + +// createGenericRoleBinding creates a namespace-scoped RoleBinding with the given +// name, subject SA, and role reference. Handles create-or-update logic including +// immutable RoleRef recreation. +func (r *EvalHubReconciler) createGenericRoleBinding(ctx context.Context, instance *evalhubv1alpha1.EvalHub, roleBindingName string, serviceAccountName string, roleRef rbacv1.RoleRef) error { log := log.FromContext(ctx) - roleBindingName := instance.Name + "-" + instance.Namespace + "-jobs-proxy-rolebinding" roleBinding := &rbacv1.RoleBinding{ ObjectMeta: metav1.ObjectMeta{ Name: roleBindingName, Namespace: instance.Namespace, Labels: map[string]string{ "app": "eval-hub", - "app.kubernetes.io/name": instance.Name + "-jobs-proxy", + "app.kubernetes.io/name": roleBindingName, "app.kubernetes.io/instance": instance.Name, "app.kubernetes.io/part-of": "eval-hub", "app.kubernetes.io/version": constants.Version, @@ -269,49 +552,36 @@ func (r *EvalHubReconciler) createJobsProxyRoleBinding(ctx context.Context, inst Namespace: instance.Namespace, }, }, - RoleRef: rbacv1.RoleRef{ - Kind: "ClusterRole", - Name: "trustyai-service-operator-evalhub-jobs-proxy-role", - APIGroup: rbacv1.GroupName, - }, + RoleRef: roleRef, } - // Set instance as the owner and controller if err := ctrl.SetControllerReference(instance, roleBinding, r.Scheme); err != nil { return err } - // Check if this RoleBinding already exists found := &rbacv1.RoleBinding{} err := r.Get(ctx, types.NamespacedName{Name: roleBinding.Name, Namespace: roleBinding.Namespace}, found) if err != nil && errors.IsNotFound(err) { - log.Info("Creating jobs proxy RoleBinding", "Name", roleBinding.Name) + log.Info("Creating RoleBinding", "Namespace", roleBinding.Namespace, "Name", roleBinding.Name) return r.Create(ctx, roleBinding) } else if err != nil { return err } - // RoleBinding exists, check if it needs updating subjectsEqual := equalRoleBindingSubjects(found.Subjects, roleBinding.Subjects) roleRefEqual := equalRoleBindingRoleRef(found.RoleRef, roleBinding.RoleRef) if !subjectsEqual || !roleRefEqual { if roleRefEqual && !subjectsEqual { - // Only subjects differ, we can update them found.Subjects = roleBinding.Subjects - log.Info("Updating jobs proxy RoleBinding subjects", "Name", roleBinding.Name) + log.Info("Updating RoleBinding subjects", "Name", roleBinding.Name) return r.Update(ctx, found) } else if !roleRefEqual { - // RoleRef differs, we need to delete and recreate as RoleRef is immutable - log.Info("RoleRef differs, deleting and recreating jobs proxy RoleBinding", "Name", roleBinding.Name) - - // Delete existing RoleBinding + log.Info("RoleRef differs, deleting and recreating RoleBinding", "Name", roleBinding.Name) if err := r.Delete(ctx, found); err != nil { return err } - - // Create new RoleBinding with desired spec - log.Info("Creating new jobs proxy RoleBinding", "Name", roleBinding.Name) + log.Info("Creating new RoleBinding", "Name", roleBinding.Name) return r.Create(ctx, roleBinding) } } @@ -319,6 +589,35 @@ func (r *EvalHubReconciler) createJobsProxyRoleBinding(ctx context.Context, inst return nil } +// equalPolicyRules compares two slices of PolicyRules for equality. +func equalPolicyRules(a, b []rbacv1.PolicyRule) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if !equalStringSlices(a[i].APIGroups, b[i].APIGroups) || + !equalStringSlices(a[i].Resources, b[i].Resources) || + !equalStringSlices(a[i].ResourceNames, b[i].ResourceNames) || + !equalStringSlices(a[i].Verbs, b[i].Verbs) { + return false + } + } + return true +} + +// equalStringSlices compares two string slices for equality (order-sensitive). +func equalStringSlices(a, b []string) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i] != b[i] { + return false + } + } + return true +} + // equalRoleBindingSubjects compares subjects between two RoleBindings in an order-insensitive way. func equalRoleBindingSubjects(existing, desired []rbacv1.Subject) bool { if len(existing) != len(desired) { @@ -364,91 +663,6 @@ func equalRoleBindingRoleRef(existing, desired rbacv1.RoleRef) bool { existing.APIGroup == desired.APIGroup } -// createClusterRoleBinding creates a binding between the service account and evalhub proxy cluster role -func (r *EvalHubReconciler) createClusterRoleBinding(ctx context.Context, instance *evalhubv1alpha1.EvalHub, serviceAccountName string) error { - log := log.FromContext(ctx) - - clusterRoleBindingName := instance.Name + "-" + instance.Namespace + "-proxy-rolebinding" - clusterRoleBinding := &rbacv1.ClusterRoleBinding{ - ObjectMeta: metav1.ObjectMeta{ - Name: clusterRoleBindingName, - }, - Subjects: []rbacv1.Subject{ - { - Kind: "ServiceAccount", - Name: serviceAccountName, - Namespace: instance.Namespace, - }, - }, - RoleRef: rbacv1.RoleRef{ - Kind: "ClusterRole", - Name: "trustyai-service-operator-evalhub-proxy-role", - APIGroup: rbacv1.GroupName, - }, - } - - // Check if ClusterRoleBinding already exists - found := &rbacv1.ClusterRoleBinding{} - err := r.Get(ctx, types.NamespacedName{Name: clusterRoleBindingName}, found) - if err != nil && errors.IsNotFound(err) { - // ClusterRoleBinding doesn't exist, create it - // Note: We don't set owner references because ClusterRoleBindings cannot have namespace-scoped owners - log.Info("Creating ClusterRoleBinding", "name", clusterRoleBindingName) - return r.Create(ctx, clusterRoleBinding) - } else if err != nil { - // Error getting ClusterRoleBinding - return err - } - - // ClusterRoleBinding already exists, check if it needs updating - subjectsEqual := equalSubjects(found, clusterRoleBinding) - roleRefEqual := equalRoleRef(found, clusterRoleBinding) - - if !subjectsEqual || !roleRefEqual { - if roleRefEqual && !subjectsEqual { - // Only subjects differ, we can update them - found.Subjects = clusterRoleBinding.Subjects - log.Info("Updating ClusterRoleBinding subjects", "name", clusterRoleBindingName) - return r.Update(ctx, found) - } else if !roleRefEqual { - // RoleRef differs, we need to delete and recreate as RoleRef is immutable - log.Info("RoleRef differs, deleting and recreating ClusterRoleBinding", "name", clusterRoleBindingName) - - // Delete existing ClusterRoleBinding - if err := r.Delete(ctx, found); err != nil { - return err - } - - // Create new ClusterRoleBinding with desired spec - log.Info("Creating new ClusterRoleBinding", "name", clusterRoleBindingName) - return r.Create(ctx, clusterRoleBinding) - } - } - - return nil -} - -// equalClusterRoleBindingSpec compares two ClusterRoleBinding specs -func equalClusterRoleBindingSpec(existing, desired *rbacv1.ClusterRoleBinding) bool { - // Compare subjects - if len(existing.Subjects) != len(desired.Subjects) { - return false - } - for i, subject := range existing.Subjects { - if i >= len(desired.Subjects) || - subject.Kind != desired.Subjects[i].Kind || - subject.Name != desired.Subjects[i].Name || - subject.Namespace != desired.Subjects[i].Namespace { - return false - } - } - - // Compare role reference - return existing.RoleRef.Kind == desired.RoleRef.Kind && - existing.RoleRef.Name == desired.RoleRef.Name && - existing.RoleRef.APIGroup == desired.RoleRef.APIGroup -} - // equalSubjects compares subjects between two ClusterRoleBindings in an order-insensitive way. func equalSubjects(existing, desired *rbacv1.ClusterRoleBinding) bool { if len(existing.Subjects) != len(desired.Subjects) { @@ -509,7 +723,7 @@ func (r *EvalHubReconciler) createJobsServiceAccount(ctx context.Context, instan Namespace: instance.Namespace, Labels: map[string]string{ "app": "eval-hub", - "app.kubernetes.io/name": serviceAccountName, + "app.kubernetes.io/name": normalizeDNS1123LabelValue(serviceAccountName), "app.kubernetes.io/instance": instance.Name, "app.kubernetes.io/part-of": "eval-hub", "app.kubernetes.io/component": "jobs", diff --git a/controllers/evalhub/unit_test.go b/controllers/evalhub/unit_test.go index 0561b43ea..6d1fbeb3b 100644 --- a/controllers/evalhub/unit_test.go +++ b/controllers/evalhub/unit_test.go @@ -577,9 +577,9 @@ func TestEvalHubReconciler_createJobsServiceAccount(t *testing.T) { }) } -// TestEvalHubReconciler_createJobsProxyRoleBinding verifies that the jobs proxy RoleBinding +// TestEvalHubReconciler_createJobsAPIAccessRoleBinding verifies that the jobs API access RoleBinding // is created with the correct RoleRef, Subjects, and owner reference. -func TestEvalHubReconciler_createJobsProxyRoleBinding(t *testing.T) { +func TestEvalHubReconciler_createJobsAPIAccessRoleBinding(t *testing.T) { scheme := runtime.NewScheme() require.NoError(t, rbacv1.AddToScheme(scheme)) require.NoError(t, evalhubv1alpha1.AddToScheme(scheme)) @@ -607,13 +607,13 @@ func TestEvalHubReconciler_createJobsProxyRoleBinding(t *testing.T) { EventRecorder: record.NewFakeRecorder(10), } - t.Run("should create jobs proxy RoleBinding with correct properties", func(t *testing.T) { + t.Run("should create jobs API access RoleBinding referencing per-instance Role", func(t *testing.T) { jobsSAName := evalHubName + "-jobs" - err := reconciler.createJobsProxyRoleBinding(ctx, evalHub, jobsSAName) + err := reconciler.createJobsAPIAccessRoleBinding(ctx, evalHub, jobsSAName) require.NoError(t, err) // Verify RoleBinding was created - rbName := evalHubName + "-" + testNamespace + "-jobs-proxy-rolebinding" + rbName := evalHubName + "-jobs-api-access-rb" rb := &rbacv1.RoleBinding{} err = fakeClient.Get(ctx, types.NamespacedName{ Name: rbName, @@ -621,9 +621,9 @@ func TestEvalHubReconciler_createJobsProxyRoleBinding(t *testing.T) { }, rb) require.NoError(t, err) - // Check RoleRef - assert.Equal(t, "ClusterRole", rb.RoleRef.Kind) - assert.Equal(t, "trustyai-service-operator-evalhub-jobs-proxy-role", rb.RoleRef.Name) + // Check RoleRef — should reference the per-instance Role (not ClusterRole) + assert.Equal(t, "Role", rb.RoleRef.Kind) + assert.Equal(t, generateJobsAPIAccessRoleName(evalHub), rb.RoleRef.Name) assert.Equal(t, rbacv1.GroupName, rb.RoleRef.APIGroup) // Check Subjects @@ -640,7 +640,7 @@ func TestEvalHubReconciler_createJobsProxyRoleBinding(t *testing.T) { t.Run("should update subjects when they differ", func(t *testing.T) { // Get existing RoleBinding - rbName := evalHubName + "-" + testNamespace + "-jobs-proxy-rolebinding" + rbName := evalHubName + "-jobs-api-access-rb" rb := &rbacv1.RoleBinding{} err := fakeClient.Get(ctx, types.NamespacedName{ Name: rbName, @@ -661,7 +661,7 @@ func TestEvalHubReconciler_createJobsProxyRoleBinding(t *testing.T) { // Reconcile again jobsSAName := evalHubName + "-jobs" - err = reconciler.createJobsProxyRoleBinding(ctx, evalHub, jobsSAName) + err = reconciler.createJobsAPIAccessRoleBinding(ctx, evalHub, jobsSAName) require.NoError(t, err) // Verify subjects were updated @@ -676,9 +676,9 @@ func TestEvalHubReconciler_createJobsProxyRoleBinding(t *testing.T) { }) } -// TestEvalHubReconciler_createJobsResourceManagementRoleBinding verifies that the jobs -// resource management RoleBinding is created with the correct RoleRef, Subjects, and owner reference. -func TestEvalHubReconciler_createJobsResourceManagementRoleBinding(t *testing.T) { +// TestEvalHubReconciler_createAPIAccessRole verifies that the per-instance API access Role +// is created with resourceNames scoped to the instance, with correct owner ref and idempotency. +func TestEvalHubReconciler_createAPIAccessRole(t *testing.T) { scheme := runtime.NewScheme() require.NoError(t, rbacv1.AddToScheme(scheme)) require.NoError(t, evalhubv1alpha1.AddToScheme(scheme)) @@ -706,13 +706,143 @@ func TestEvalHubReconciler_createJobsResourceManagementRoleBinding(t *testing.T) EventRecorder: record.NewFakeRecorder(10), } - t.Run("should create jobs resource management RoleBinding with correct properties", func(t *testing.T) { + t.Run("should create API access Role with resourceNames", func(t *testing.T) { + err := reconciler.createAPIAccessRole(ctx, evalHub) + require.NoError(t, err) + + roleName := generateAPIAccessRoleName(evalHub) + role := &rbacv1.Role{} + err = fakeClient.Get(ctx, types.NamespacedName{ + Name: roleName, + Namespace: testNamespace, + }, role) + require.NoError(t, err) + + // Check name + assert.Equal(t, evalHubName+"-api-access-role", role.Name) + + // Check rules have resourceNames scoped to this instance + require.Len(t, role.Rules, 2) + + // First rule: evalhubs get + assert.Equal(t, []string{"trustyai.opendatahub.io"}, role.Rules[0].APIGroups) + assert.Equal(t, []string{"evalhubs"}, role.Rules[0].Resources) + assert.Equal(t, []string{evalHubName}, role.Rules[0].ResourceNames) + assert.Equal(t, []string{"get"}, role.Rules[0].Verbs) + + // Second rule: evalhubs/proxy get,create + assert.Equal(t, []string{"trustyai.opendatahub.io"}, role.Rules[1].APIGroups) + assert.Equal(t, []string{"evalhubs/proxy"}, role.Rules[1].Resources) + assert.Equal(t, []string{evalHubName}, role.Rules[1].ResourceNames) + assert.Equal(t, []string{"get", "create"}, role.Rules[1].Verbs) + + // Check owner reference + require.Len(t, role.OwnerReferences, 1) + assert.Equal(t, evalHubName, role.OwnerReferences[0].Name) + assert.Equal(t, "EvalHub", role.OwnerReferences[0].Kind) + assert.True(t, *role.OwnerReferences[0].Controller) + }) + + t.Run("should be idempotent on repeated calls", func(t *testing.T) { + err := reconciler.createAPIAccessRole(ctx, evalHub) + require.NoError(t, err) + + roleName := generateAPIAccessRoleName(evalHub) + role := &rbacv1.Role{} + err = fakeClient.Get(ctx, types.NamespacedName{ + Name: roleName, + Namespace: testNamespace, + }, role) + require.NoError(t, err) + assert.Equal(t, []string{evalHubName}, role.Rules[0].ResourceNames) + }) +} + +// TestEvalHubReconciler_createAPIAccessRoleBinding_RefersToRole verifies that the +// API access RoleBinding references a Role (not ClusterRole). +func TestEvalHubReconciler_createAPIAccessRoleBinding_RefersToRole(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, rbacv1.AddToScheme(scheme)) + require.NoError(t, evalhubv1alpha1.AddToScheme(scheme)) + + ctx := context.Background() + testNamespace := "test-namespace" + evalHubName := "test-evalhub" + + evalHub := &evalhubv1alpha1.EvalHub{ + ObjectMeta: metav1.ObjectMeta{ + Name: evalHubName, + Namespace: testNamespace, + UID: "test-uid-123", + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(evalHub). + Build() + + reconciler := &EvalHubReconciler{ + Client: fakeClient, + Scheme: scheme, + EventRecorder: record.NewFakeRecorder(10), + } + + t.Run("should reference Role not ClusterRole", func(t *testing.T) { + apiSAName := evalHubName + "-api" + err := reconciler.createAPIAccessRoleBinding(ctx, evalHub, apiSAName) + require.NoError(t, err) + + rbName := evalHubName + "-api-access-rb" + rb := &rbacv1.RoleBinding{} + err = fakeClient.Get(ctx, types.NamespacedName{ + Name: rbName, + Namespace: testNamespace, + }, rb) + require.NoError(t, err) + + assert.Equal(t, "Role", rb.RoleRef.Kind, "RoleBinding should reference Role, not ClusterRole") + assert.Equal(t, generateAPIAccessRoleName(evalHub), rb.RoleRef.Name) + }) +} + +// TestEvalHubReconciler_createMLFlowAccessRoleBinding_JobsRole verifies that the jobs +// MLflow RoleBinding uses the restricted jobs-specific ClusterRole. +func TestEvalHubReconciler_createMLFlowAccessRoleBinding_JobsRole(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, rbacv1.AddToScheme(scheme)) + require.NoError(t, evalhubv1alpha1.AddToScheme(scheme)) + + ctx := context.Background() + testNamespace := "test-namespace" + evalHubName := "test-evalhub" + + evalHub := &evalhubv1alpha1.EvalHub{ + ObjectMeta: metav1.ObjectMeta{ + Name: evalHubName, + Namespace: testNamespace, + UID: "test-uid-123", + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(evalHub). + Build() + + reconciler := &EvalHubReconciler{ + Client: fakeClient, + Scheme: scheme, + EventRecorder: record.NewFakeRecorder(10), + } + + t.Run("should create jobs MLflow RoleBinding with restricted ClusterRole", func(t *testing.T) { jobsSAName := evalHubName + "-jobs" - err := reconciler.createJobsResourceManagementRoleBinding(ctx, evalHub, jobsSAName) + err := reconciler.createMLFlowAccessRoleBinding(ctx, evalHub, jobsSAName, "jobs", mlflowJobsAccessClusterRoleName) require.NoError(t, err) // Verify RoleBinding was created - rbName := evalHubName + "-resource-manager-jobs" + rbName := evalHubName + "-mlflow-jobs-rb" rb := &rbacv1.RoleBinding{} err = fakeClient.Get(ctx, types.NamespacedName{ Name: rbName, @@ -720,9 +850,9 @@ func TestEvalHubReconciler_createJobsResourceManagementRoleBinding(t *testing.T) }, rb) require.NoError(t, err) - // Check RoleRef + // Check RoleRef uses the jobs-specific restricted role assert.Equal(t, "ClusterRole", rb.RoleRef.Kind) - assert.Equal(t, "trustyai-service-operator-evalhub-resource-manager", rb.RoleRef.Name) + assert.Equal(t, "trustyai-service-operator-evalhub-mlflow-jobs-access", rb.RoleRef.Name) assert.Equal(t, rbacv1.GroupName, rb.RoleRef.APIGroup) // Check Subjects @@ -737,52 +867,38 @@ func TestEvalHubReconciler_createJobsResourceManagementRoleBinding(t *testing.T) assert.Equal(t, "EvalHub", rb.OwnerReferences[0].Kind) }) - t.Run("should update subjects when they differ", func(t *testing.T) { - // Get existing RoleBinding - rbName := evalHubName + "-resource-manager-jobs" - rb := &rbacv1.RoleBinding{} - err := fakeClient.Get(ctx, types.NamespacedName{ - Name: rbName, - Namespace: testNamespace, - }, rb) + t.Run("should create API MLflow RoleBinding with full ClusterRole", func(t *testing.T) { + apiSAName := evalHubName + "-api" + err := reconciler.createMLFlowAccessRoleBinding(ctx, evalHub, apiSAName, "api", mlflowAccessClusterRoleName) require.NoError(t, err) - // Modify subjects - rb.Subjects = []rbacv1.Subject{ - { - Kind: "ServiceAccount", - Name: "different-sa", - Namespace: testNamespace, - }, - } - err = fakeClient.Update(ctx, rb) - require.NoError(t, err) - - // Reconcile again - jobsSAName := evalHubName + "-jobs" - err = reconciler.createJobsResourceManagementRoleBinding(ctx, evalHub, jobsSAName) - require.NoError(t, err) - - // Verify subjects were updated + // Verify RoleBinding was created + rbName := evalHubName + "-mlflow-api-rb" + rb := &rbacv1.RoleBinding{} err = fakeClient.Get(ctx, types.NamespacedName{ Name: rbName, Namespace: testNamespace, }, rb) require.NoError(t, err) + // Check RoleRef uses the full access role + assert.Equal(t, "ClusterRole", rb.RoleRef.Kind) + assert.Equal(t, "trustyai-service-operator-evalhub-mlflow-access", rb.RoleRef.Name) + assert.Equal(t, rbacv1.GroupName, rb.RoleRef.APIGroup) + + // Check Subjects require.Len(t, rb.Subjects, 1) - assert.Equal(t, jobsSAName, rb.Subjects[0].Name) + assert.Equal(t, "ServiceAccount", rb.Subjects[0].Kind) + assert.Equal(t, apiSAName, rb.Subjects[0].Name) }) t.Run("should be idempotent on repeated calls", func(t *testing.T) { jobsSAName := evalHubName + "-jobs" - - // Call again - err := reconciler.createJobsResourceManagementRoleBinding(ctx, evalHub, jobsSAName) + err := reconciler.createMLFlowAccessRoleBinding(ctx, evalHub, jobsSAName, "jobs", mlflowJobsAccessClusterRoleName) require.NoError(t, err) // Verify RoleBinding still exists with correct properties - rbName := evalHubName + "-resource-manager-jobs" + rbName := evalHubName + "-mlflow-jobs-rb" rb := &rbacv1.RoleBinding{} err = fakeClient.Get(ctx, types.NamespacedName{ Name: rbName, @@ -790,14 +906,14 @@ func TestEvalHubReconciler_createJobsResourceManagementRoleBinding(t *testing.T) }, rb) require.NoError(t, err) - assert.Equal(t, "trustyai-service-operator-evalhub-resource-manager", rb.RoleRef.Name) + assert.Equal(t, "trustyai-service-operator-evalhub-mlflow-jobs-access", rb.RoleRef.Name) require.Len(t, rb.Subjects, 1) assert.Equal(t, jobsSAName, rb.Subjects[0].Name) }) } // TestEvalHubReconciler_cleanupClusterRoleBinding verifies that cleanup removes -// the proxy ClusterRoleBinding, jobs RoleBinding, and legacy ClusterRoleBinding. +// the auth-reviewer ClusterRoleBinding (the only cluster-scoped resource not owner-ref'd). func TestEvalHubReconciler_cleanupClusterRoleBinding(t *testing.T) { scheme := runtime.NewScheme() require.NoError(t, rbacv1.AddToScheme(scheme)) @@ -814,60 +930,20 @@ func TestEvalHubReconciler_cleanupClusterRoleBinding(t *testing.T) { }, } - // Create test resources - proxyCRBName := evalHubName + "-" + testNamespace + "-proxy-rolebinding" - proxyCRB := &rbacv1.ClusterRoleBinding{ - ObjectMeta: metav1.ObjectMeta{ - Name: proxyCRBName, - }, - RoleRef: rbacv1.RoleRef{ - APIGroup: rbacv1.GroupName, - Kind: "ClusterRole", - Name: "trustyai-service-operator-evalhub-proxy-role", - }, - Subjects: []rbacv1.Subject{ - { - Kind: "ServiceAccount", - Name: evalHubName + "-proxy", - Namespace: testNamespace, - }, - }, - } - - jobsRBName := evalHubName + "-" + testNamespace + "-jobs-proxy-rolebinding" - jobsRB := &rbacv1.RoleBinding{ + authCRBName := evalHubName + "-" + testNamespace + "-auth-reviewer-crb" + authCRB := &rbacv1.ClusterRoleBinding{ ObjectMeta: metav1.ObjectMeta{ - Name: jobsRBName, - Namespace: testNamespace, + Name: authCRBName, }, RoleRef: rbacv1.RoleRef{ APIGroup: rbacv1.GroupName, Kind: "ClusterRole", - Name: "trustyai-service-operator-evalhub-jobs-proxy-role", + Name: authReviewerClusterRoleName, }, Subjects: []rbacv1.Subject{ { Kind: "ServiceAccount", - Name: evalHubName + "-jobs", - Namespace: testNamespace, - }, - }, - } - - legacyJobsCRBName := testNamespace + "-" + evalHubName + "-jobs-proxy" - legacyJobsCRB := &rbacv1.ClusterRoleBinding{ - ObjectMeta: metav1.ObjectMeta{ - Name: legacyJobsCRBName, - }, - RoleRef: rbacv1.RoleRef{ - APIGroup: rbacv1.GroupName, - Kind: "ClusterRole", - Name: "trustyai-service-operator-evalhub-jobs-proxy-role", - }, - Subjects: []rbacv1.Subject{ - { - Kind: "ServiceAccount", - Name: evalHubName + "-jobs", + Name: evalHubName + "-api", Namespace: testNamespace, }, }, @@ -875,7 +951,7 @@ func TestEvalHubReconciler_cleanupClusterRoleBinding(t *testing.T) { fakeClient := fake.NewClientBuilder(). WithScheme(scheme). - WithObjects(evalHub, proxyCRB, jobsRB, legacyJobsCRB). + WithObjects(evalHub, authCRB). Build() reconciler := &EvalHubReconciler{ @@ -884,32 +960,15 @@ func TestEvalHubReconciler_cleanupClusterRoleBinding(t *testing.T) { EventRecorder: record.NewFakeRecorder(10), } - t.Run("should delete proxy ClusterRoleBinding", func(t *testing.T) { + t.Run("should delete auth reviewer ClusterRoleBinding", func(t *testing.T) { err := reconciler.cleanupClusterRoleBinding(ctx, evalHub) require.NoError(t, err) - // Verify proxy CRB was deleted - err = fakeClient.Get(ctx, types.NamespacedName{Name: proxyCRBName}, &rbacv1.ClusterRoleBinding{}) - assert.True(t, errors.IsNotFound(err), "Proxy ClusterRoleBinding should be deleted") - }) - - t.Run("should delete jobs RoleBinding", func(t *testing.T) { - // Verify jobs RB was deleted - err := fakeClient.Get(ctx, types.NamespacedName{ - Name: jobsRBName, - Namespace: testNamespace, - }, &rbacv1.RoleBinding{}) - assert.True(t, errors.IsNotFound(err), "Jobs RoleBinding should be deleted") - }) - - t.Run("should delete legacy jobs ClusterRoleBinding", func(t *testing.T) { - // Verify legacy CRB was deleted - err := fakeClient.Get(ctx, types.NamespacedName{Name: legacyJobsCRBName}, &rbacv1.ClusterRoleBinding{}) - assert.True(t, errors.IsNotFound(err), "Legacy jobs ClusterRoleBinding should be deleted") + err = fakeClient.Get(ctx, types.NamespacedName{Name: authCRBName}, &rbacv1.ClusterRoleBinding{}) + assert.True(t, errors.IsNotFound(err), "Auth reviewer ClusterRoleBinding should be deleted") }) t.Run("should be idempotent when resources don't exist", func(t *testing.T) { - // Call cleanup again - should not error err := reconciler.cleanupClusterRoleBinding(ctx, evalHub) require.NoError(t, err) }) @@ -1207,8 +1266,8 @@ func TestEvalHubReconciler_reconcileDeployment_WithDB(t *testing.T) { }, deployment) require.NoError(t, err) - // Should have 4 volumes: evalhub-config, kube-rbac-proxy-config, tls, db-secret - assert.Len(t, deployment.Spec.Template.Spec.Volumes, 4) + // Should have 6 volumes: evalhub-config, kube-rbac-proxy-config, tls, service-ca, mlflow-token, db-secret + assert.Len(t, deployment.Spec.Template.Spec.Volumes, 6) // Find the DB secret volume var dbVolume *corev1.Volume @@ -1234,7 +1293,7 @@ func TestEvalHubReconciler_reconcileDeployment_WithDB(t *testing.T) { } } require.NotNil(t, container) - assert.Len(t, container.VolumeMounts, 2) // evalhub-config + db-secret + assert.Len(t, container.VolumeMounts, 4) // evalhub-config + service-ca + mlflow-token + db-secret var dbMount *corev1.VolumeMount for i, m := range container.VolumeMounts {