Merged
18 changes: 18 additions & 0 deletions manifests/rhoai/rbac_progression_patch.yaml
@@ -11,3 +11,21 @@
    verbs:
      - get
      - list
# RHAI-specific: Permissions for NetworkPolicy management
# Required to create/update NetworkPolicies that restrict metrics endpoint access
# to controller pods only (security hardening for progression tracking)
# Note: list/watch needed for controller-runtime cache; delete not needed (OwnerReference cleanup)
- op: add
  path: /rules/-
  value:
    apiGroups:
      - networking.k8s.io
    resources:
      - networkpolicies
    verbs:
      - get
      - list
      - watch
      - create
      - update
      - patch
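
For reference, once this JSON 6902 patch is applied, the appended ClusterRole rule would render roughly as follows (a sketch of the merged output; the surrounding rules come from the base manifest and are unchanged):

```yaml
# Sketch of the rule appended by the patch above.
- apiGroups:
    - networking.k8s.io
  resources:
    - networkpolicies
  verbs:
    - get
    - list
    - watch
    - create
    - update
    - patch
```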
5 changes: 5 additions & 0 deletions pkg/controller/trainjob_controller.go
@@ -43,6 +43,7 @@ import (

	trainer "github.com/kubeflow/trainer/v2/pkg/apis/trainer/v1alpha1"
	"github.com/kubeflow/trainer/v2/pkg/constants"
	"github.com/kubeflow/trainer/v2/pkg/rhai"
	"github.com/kubeflow/trainer/v2/pkg/rhai/progression"
	jobruntimes "github.com/kubeflow/trainer/v2/pkg/runtime"
)
@@ -158,6 +159,10 @@ func (r *TrainJobReconciler) reconcileObjects(ctx context.Context, runtime jobru
			return err
		}
	}
	// Reconcile NetworkPolicy for pod isolation
	if err := rhai.ReconcileNetworkPolicy(ctx, r.client, trainJob); err != nil {
		return err
	}
Comment on lines +162 to +165
@coderabbitai coderabbitai bot Dec 11, 2025

⚠️ Potential issue | 🟠 Major


Add conditional check for progression-tracking annotation before NetworkPolicy reconciliation

The ReconcileNetworkPolicy call is unconditional, but the PR description states the NetworkPolicy is added "when progression-tracking is enabled." Since IsProgressionTrackingEnabled() already exists, wrap the call:

if progression.IsProgressionTrackingEnabled(trainJob) {
    if err := progression.ReconcileNetworkPolicy(ctx, r.client, trainJob); err != nil {
        return err
    }
}

Otherwise, NetworkPolicy is created for all TrainJobs regardless of the progression-tracking setting.



nit: I think this is a good suggestion. Can we ensure that, for any existing TrainJob with progression tracking disabled, the pod behaviour does not change?
cc. @abhijeet-dhumal


Seems like the humans are having a chat. I'll hop back into my burrow for now. If you need me again, just tag @coderabbitai in a new comment, and I'll come hopping out!

Member Author

@kapil27 Actually, this was an intentional design decision. The NetworkPolicy serves two purposes, as described in the PR description:

- Pod isolation: only same-job pods can communicate; cross-job traffic is blocked (primary purpose, not dependent on progression tracking).
- Progression metrics port restriction: controller-only access.

These benefit all TrainJobs, regardless of progression tracking. Without the policy, any pod in the namespace could access training pods.

Member Author

Ah, come to think of it, you are right 🤔
We should make the progression-based rule conditional on IsProgressionTrackingEnabled(), since the metrics server only runs when progression tracking is enabled 👀
Thanks @kapil27, on it!

	return nil
}

31 changes: 28 additions & 3 deletions pkg/rhai/constants/constants.go
@@ -21,8 +21,8 @@ const (
// Progression tracking feature annotations

// AnnotationProgressionTracking enables/disables progression tracking for a TrainJob.
// Value: "enabled" to enable tracking, any other value or absence disables it.
// Example: trainer.opendatahub.io/progression-tracking: "enabled"
// Value: "true" to enable tracking, any other value or absence disables it.
// Example: trainer.opendatahub.io/progression-tracking: "true"
AnnotationProgressionTracking string = "trainer.opendatahub.io/progression-tracking"

// AnnotationTrainerStatus stores the JSON-encoded training status/progress.
@@ -31,7 +31,9 @@ const (
AnnotationTrainerStatus string = "trainer.opendatahub.io/trainerStatus"

// AnnotationMetricsPort specifies the port where the training pod exposes metrics.
// Default: 28080
// Default: 28080. Valid range: 1024-65535 (non-privileged ports).
// Ports 0-1023 require root privileges and are incompatible with OpenShift
// restricted SCCs and Kubernetes non-root security policies.
// Example: trainer.opendatahub.io/metrics-port: "8080"
AnnotationMetricsPort string = "trainer.opendatahub.io/metrics-port"

@@ -60,4 +62,27 @@
// TerminationGraceBufferSecs is added to preStop duration for pod termination grace period.
// This allows time for graceful process shutdown after preStop hook completes.
TerminationGraceBufferSecs int = 30

// NetworkPolicy constants for metrics endpoint security

// DefaultControllerNamespace is the fallback when SA namespace file is unavailable.
DefaultControllerNamespace string = "opendatahub"

// ControllerPodLabelName is the label key used to identify the controller pod.
// NetworkPolicy uses this to allow controller access to training pod metrics.
ControllerPodLabelName string = "app.kubernetes.io/name"

// ControllerPodLabelNameValue is the expected value for the controller name label.
// Must match the label applied to controller pods in deployment manifests.
// RHOAI: Set via kustomization.yaml labels overlay.
ControllerPodLabelNameValue string = "trainer"

// ControllerPodLabelComponent is the label key for component identification.
ControllerPodLabelComponent string = "app.kubernetes.io/component"

// ControllerPodLabelComponentValue is the expected value for the controller component label.
// RHOAI uses "controller" (set via kustomization.yaml).
// Upstream Kubeflow uses "manager" (set in base/manager/manager.yaml).
// This value must match your deployment's controller pod labels.
ControllerPodLabelComponentValue string = "controller"
)
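
As a hypothetical illustration of the RHOAI overlay these comments reference, a kustomization.yaml labels stanza producing matching controller pod labels might look like this (a sketch; the actual overlay file and its contents are assumptions, not quoted from the repo):

```yaml
# Hypothetical kustomization.yaml fragment; the real RHOAI overlay may differ.
labels:
  - includeSelectors: true
    pairs:
      app.kubernetes.io/name: trainer
      app.kubernetes.io/component: controller
```

These pairs must line up with ControllerPodLabelNameValue and ControllerPodLabelComponentValue above, or the NetworkPolicy's controller-ingress rule will not select the controller pods.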
180 changes: 180 additions & 0 deletions pkg/rhai/networkpolicy.go
Collaborator

@robert-bell robert-bell Dec 11, 2025

nit: does it make sense to move this netpol code into the rhai package, rather than progression package?

I'm happy for this to be merged as is though :)

Member Author

Yeah, definitely. Since this netpol is not scoped only to progression, it would be good to refactor accordingly.
Thanks Rob, on it!!

@@ -0,0 +1,180 @@
/*
Copyright 2024 The Kubeflow Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package rhai

import (
	"context"
	"fmt"
	"os"
	"strconv"
	"strings"

	corev1 "k8s.io/api/core/v1"
	networkingv1 "k8s.io/api/networking/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/intstr"
	"k8s.io/klog/v2"
	"sigs.k8s.io/controller-runtime/pkg/client"

	trainer "github.com/kubeflow/trainer/v2/pkg/apis/trainer/v1alpha1"
	"github.com/kubeflow/trainer/v2/pkg/rhai/constants"
	"github.com/kubeflow/trainer/v2/pkg/rhai/progression"
)

const serviceAccountNamespaceFile = "/var/run/secrets/kubernetes.io/serviceaccount/namespace"

// getControllerNamespace returns the controller's namespace from the SA mount.
func getControllerNamespace() string {
	if data, err := os.ReadFile(serviceAccountNamespaceFile); err == nil {
		if ns := strings.TrimSpace(string(data)); ns != "" {
			return ns
		}
	}
	return constants.DefaultControllerNamespace
}

func getNetworkPolicyName(trainJob *trainer.TrainJob) string {
	return trainJob.Name
}

// buildNetworkPolicy creates a NetworkPolicy for the TrainJob's pods.
// Rule 1 (same-job pods → all ports) is always added for pod isolation.
// Rule 2 (controller → metrics port) is only added when progression tracking is enabled.
func buildNetworkPolicy(trainJob *trainer.TrainJob) *networkingv1.NetworkPolicy {
	ingressRules := []networkingv1.NetworkPolicyIngressRule{}

	// Rule 1: Same-job pods → all ports (always, for NCCL/MPI/gRPC)
	ingressRules = append(ingressRules, networkingv1.NetworkPolicyIngressRule{
		From: []networkingv1.NetworkPolicyPeer{
			{
				PodSelector: &metav1.LabelSelector{
					MatchLabels: map[string]string{
						"jobset.sigs.k8s.io/jobset-name": trainJob.Name,
					},
				},
			},
		},
	})

	// Rule 2: Controller → metrics port (only when progression tracking enabled)
	if progression.IsProgressionTrackingEnabled(trainJob) {
		metricsPort := progression.GetMetricsPort(trainJob)
		portNum, err := strconv.Atoi(metricsPort)
		if err != nil {
			klog.Warningf("Invalid metrics port %q for TrainJob %s/%s, falling back to default %s",
				metricsPort, trainJob.Namespace, trainJob.Name, constants.DefaultMetricsPort)
			portNum, _ = strconv.Atoi(constants.DefaultMetricsPort)
		}
		port := intstr.FromInt(portNum)
		controllerNamespace := getControllerNamespace()

		ingressRules = append(ingressRules, networkingv1.NetworkPolicyIngressRule{
			From: []networkingv1.NetworkPolicyPeer{
				{
					NamespaceSelector: &metav1.LabelSelector{
						MatchLabels: map[string]string{
							"kubernetes.io/metadata.name": controllerNamespace,
						},
					},
					PodSelector: &metav1.LabelSelector{
						MatchLabels: map[string]string{
							constants.ControllerPodLabelName:      constants.ControllerPodLabelNameValue,
							constants.ControllerPodLabelComponent: constants.ControllerPodLabelComponentValue,
						},
					},
				},
			},
			Ports: []networkingv1.NetworkPolicyPort{
				{
					Protocol: protocolPtr(corev1.ProtocolTCP),
					Port:     &port,
				},
			},
		})
	}

	return &networkingv1.NetworkPolicy{
		ObjectMeta: metav1.ObjectMeta{
			Name:      getNetworkPolicyName(trainJob),
			Namespace: trainJob.Namespace,
			Labels: map[string]string{
				"trainer.kubeflow.org/trainjob-name": trainJob.Name,
				"trainer.kubeflow.org/component":     "network-policy",
			},
			OwnerReferences: []metav1.OwnerReference{
				{
					APIVersion:         trainer.SchemeGroupVersion.String(),
					Kind:               "TrainJob",
					Name:               trainJob.Name,
					UID:                trainJob.UID,
					Controller:         boolPtr(true),
					BlockOwnerDeletion: boolPtr(true),
				},
			},
		},
		Spec: networkingv1.NetworkPolicySpec{
			PodSelector: metav1.LabelSelector{
				MatchLabels: map[string]string{
					"jobset.sigs.k8s.io/jobset-name": trainJob.Name,
				},
			},
			PolicyTypes: []networkingv1.PolicyType{
				networkingv1.PolicyTypeIngress,
			},
			Ingress: ingressRules,
		},
	}
}

func boolPtr(b bool) *bool {
	return &b
}

func protocolPtr(p corev1.Protocol) *corev1.Protocol {
	return &p
}

// ReconcileNetworkPolicy creates/updates the NetworkPolicy for the TrainJob.
// Uses an OwnerReference for automatic cleanup.
func ReconcileNetworkPolicy(ctx context.Context, c client.Client, trainJob *trainer.TrainJob) error {
	desiredPolicy := buildNetworkPolicy(trainJob)
	existingPolicy := &networkingv1.NetworkPolicy{}
	err := c.Get(ctx, client.ObjectKey{
		Namespace: trainJob.Namespace,
		Name:      getNetworkPolicyName(trainJob),
	}, existingPolicy)

	if apierrors.IsNotFound(err) {
		if createErr := c.Create(ctx, desiredPolicy); createErr != nil {
			return fmt.Errorf("failed to create NetworkPolicy: %w", createErr)
		}
		return nil
	}

	if err != nil {
		return fmt.Errorf("failed to get NetworkPolicy: %w", err)
	}

	existingPolicy.Spec = desiredPolicy.Spec
	existingPolicy.Labels = desiredPolicy.Labels
	if updateErr := c.Update(ctx, existingPolicy); updateErr != nil {
		return fmt.Errorf("failed to update NetworkPolicy: %w", updateErr)
	}

	return nil
}
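
To make the builder's output concrete: for a hypothetical TrainJob named `my-job` in namespace `team-a` with progression tracking enabled and the default metrics port (28080), buildNetworkPolicy would produce roughly the object below. This is a sketch: the job/namespace names are invented, the `apiVersion` is assumed from the v1alpha1 API import, the `uid` is omitted, and the controller namespace shown is the `opendatahub` fallback.

```yaml
apiVersion: trainer.kubeflow.org/v1alpha1   # assumed from SchemeGroupVersion
kind: NetworkPolicy
metadata:
  name: my-job
  namespace: team-a
  labels:
    trainer.kubeflow.org/trainjob-name: my-job
    trainer.kubeflow.org/component: network-policy
  ownerReferences:
    - apiVersion: trainer.kubeflow.org/v1alpha1
      kind: TrainJob
      name: my-job
      controller: true
      blockOwnerDeletion: true
spec:
  podSelector:
    matchLabels:
      jobset.sigs.k8s.io/jobset-name: my-job
  policyTypes:
    - Ingress
  ingress:
    # Rule 1: same-job pods may reach all ports (NCCL/MPI/gRPC)
    - from:
        - podSelector:
            matchLabels:
              jobset.sigs.k8s.io/jobset-name: my-job
    # Rule 2: controller pods may reach the metrics port (progression tracking only)
    - from:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: opendatahub
          podSelector:
            matchLabels:
              app.kubernetes.io/name: trainer
              app.kubernetes.io/component: controller
      ports:
        - protocol: TCP
          port: 28080
```

With progression tracking disabled, only Rule 1 would be emitted, so the policy's kind line should of course read `kind: NetworkPolicy` with `apiVersion: networking.k8s.io/v1` when applied directly; the trainer apiVersion above appears only in the ownerReference.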