Skip to content

Commit 3ce9617

Browse files
authored
fix: tf connection reconcile bug, add API auth and fix scheduler binding permission bug (#257)
* fix: worker scheduled to CPU node bug; refactor main func; webhook missing-patches issue
* chore: lint issue
* fix: connection and assign-port API auth; scheduler binding permission
* fix: connection could not recover when pod is running again
* fix: lint issue
* fix: scheduler did not set the correct gpu-ids annotation
1 parent bafa6e0 commit 3ce9617

File tree

17 files changed

+267
-56
lines changed

17 files changed

+267
-56
lines changed

.vscode/settings.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@
107107
"schedulingconfigtemplate",
108108
"schedulingconfigtemplates",
109109
"schedulingcorev",
110+
"serviceaccount",
110111
"shirou",
111112
"shortuuid",
112113
"statefulsets",
@@ -118,6 +119,7 @@
118119
"tensorfusionaiv",
119120
"tensorfusioncluster",
120121
"tensorfusionclusters",
122+
"tensorfusionconnection",
121123
"tensorfusionconnections",
122124
"tensorfusionworkload",
123125
"tensorfusionworkloads",

charts/tensor-fusion/Chart.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@ type: application
1515
# This is the chart version. This version number should be incremented each time you make changes
1616
# to the chart and its templates, including the app version.
1717
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18-
version: 1.4.2
18+
version: 1.4.3
1919

2020
# This is the version number of the application being deployed. This version number should be
2121
# incremented each time you make changes to the application. Versions are not expected to
2222
# follow Semantic Versioning. They should reflect the version the application is using.
2323
# It is recommended to use it with quotes.
24-
appVersion: "1.35.2"
24+
appVersion: "1.36.1"

charts/tensor-fusion/templates/rbac.yaml

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ kind: ClusterRole
33
metadata:
44
name: {{ include "tensor-fusion.fullname" . }}-role
55
rules:
6-
rules:
76
- apiGroups:
87
- ""
98
resources:
@@ -35,24 +34,19 @@ rules:
3534
resources:
3635
- nodes/finalizers
3736
- pods/binding
37+
- pods/exec
3838
- pods/finalizers
3939
verbs:
40-
- update
41-
- apiGroups:
42-
- ""
43-
resources:
44-
- nodes/status
45-
- pods/status
46-
verbs:
40+
- create
4741
- get
4842
- patch
4943
- update
5044
- apiGroups:
5145
- ""
5246
resources:
53-
- pods/exec
47+
- nodes/status
48+
- pods/status
5449
verbs:
55-
- create
5650
- get
5751
- patch
5852
- update

cmd/main.go

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,7 @@ func main() {
156156
gpuInfos := make([]config.GpuInfo, 0)
157157
gpuPricingMap := make(map[string]float64)
158158
startWatchGPUInfoChanges(ctx, &gpuInfos, gpuPricingMap)
159+
utils.InitServiceAccountConfig()
159160

160161
metricsServerOptions := metricsserver.Options{
161162
BindAddress: metricsAddr,
@@ -205,7 +206,7 @@ func main() {
205206

206207
startCustomResourceController(ctx, mgr, metricsRecorder, allocator, portAllocator)
207208

208-
startHttpServerForTFClient(ctx, kc, portAllocator)
209+
startHttpServerForTFClient(ctx, kc, portAllocator, mgr.Elected())
209210

210211
// +kubebuilder:scaffold:builder
211212
addHealthCheckAPI(mgr)
@@ -250,7 +251,12 @@ func startTensorFusionAllocators(
250251
return allocator, portAllocator
251252
}
252253

253-
func startHttpServerForTFClient(ctx context.Context, kc *rest.Config, portAllocator *portallocator.PortAllocator) {
254+
func startHttpServerForTFClient(
255+
ctx context.Context,
256+
kc *rest.Config,
257+
portAllocator *portallocator.PortAllocator,
258+
leaderChan <-chan struct{},
259+
) {
254260
client, err := client.NewWithWatch(kc, client.Options{Scheme: scheme})
255261
if err != nil {
256262
setupLog.Error(err, "failed to create client with watch")
@@ -266,7 +272,7 @@ func startHttpServerForTFClient(ctx context.Context, kc *rest.Config, portAlloca
266272
setupLog.Error(err, "failed to create assign host port router")
267273
os.Exit(1)
268274
}
269-
httpServer := server.NewHTTPServer(connectionRouter, assignHostPortRouter)
275+
httpServer := server.NewHTTPServer(connectionRouter, assignHostPortRouter, leaderChan)
270276
go func() {
271277
err := httpServer.Run()
272278
if err != nil {

config/rbac/role.yaml

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -35,24 +35,19 @@ rules:
3535
resources:
3636
- nodes/finalizers
3737
- pods/binding
38+
- pods/exec
3839
- pods/finalizers
3940
verbs:
40-
- update
41-
- apiGroups:
42-
- ""
43-
resources:
44-
- nodes/status
45-
- pods/status
46-
verbs:
41+
- create
4742
- get
4843
- patch
4944
- update
5045
- apiGroups:
5146
- ""
5247
resources:
53-
- pods/exec
48+
- nodes/status
49+
- pods/status
5450
verbs:
55-
- create
5651
- get
5752
- patch
5853
- update

internal/constants/constants.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,3 +185,21 @@ const (
185185
LowFrequencyObjFailureMaxBurst = 1
186186
LowFrequencyObjFailureConcurrentReconcile = 5
187187
)
188+
189+
// For security enhancement, there are 2 types of endpoints to protect
190+
// 1. client call operator /connection API, to obtain tensor fusion worker's URL
191+
// 2. worker call hypervisor API, to obtain current workers GPU quota info
192+
// if this env var is set on operator and hypervisor, will try to verify JWT signature for each call
193+
// not implemented yet, iss is public in EKS and most K8S distribution
194+
// but k3s and some K8S distribution may not support, need to find some way to get SA token JWT pub key
195+
196+
const HypervisorVerifyServiceAccountEnabledEnvVar = "SA_TOKEN_VERIFY_ENABLED"
197+
const HypervisorVerifyServiceAccountPublicKeyEnvVar = "SA_TOKEN_VERIFY_PUBLIC_KEY"
198+
199+
// TensorFusion ControllerManager's http endpoint will verify Pod JWT signature
200+
// if this env var is set, will disable the verification, it's enabled by default
201+
// should not set to true in production environment
202+
const DisableConnectionAuthEnv = "DISABLE_CONNECTION_AUTH"
203+
204+
const AuthorizationHeader = "Authorization"
205+
const ExtraVerificationInfoPodIDKey = "authentication.kubernetes.io/pod-uid"

internal/controller/node_controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ type NodeReconciler struct {
5353

5454
// +kubebuilder:rbac:groups=core,resources=nodes,verbs=get;list;watch;create;update;patch;delete
5555
// +kubebuilder:rbac:groups=core,resources=nodes/status,verbs=get;update;patch
56-
// +kubebuilder:rbac:groups=core,resources=nodes/finalizers,verbs=update
56+
// +kubebuilder:rbac:groups=core,resources=nodes/finalizers,verbs=create;get;patch;update
5757

5858
// This reconcile loop only take effect on nodeSelector mode, while in AutoProvision mode, GPUNode will manage the K8S Node rather than reversed
5959
func (r *NodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {

internal/controller/pod_controller.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,8 @@ type PodReconciler struct {
5757
// +kubebuilder:rbac:groups=policy,resources=*,verbs=get;list;watch
5858
// +kubebuilder:rbac:groups=core,resources=pods/status,verbs=get;update;patch
5959
// +kubebuilder:rbac:groups=core,resources=pods/exec,verbs=create;get;update;patch
60-
// +kubebuilder:rbac:groups=core,resources=pods/finalizers,verbs=update
61-
// +kubebuilder:rbac:groups=core,resources=pods/binding,verbs=update
60+
// +kubebuilder:rbac:groups=core,resources=pods/finalizers,verbs=create;get;update;patch
61+
// +kubebuilder:rbac:groups=core,resources=pods/binding,verbs=create;get;update;patch
6262

6363
// Add GPU connection for Pods using GPU
6464
// Have to create TensorFusion connection here because pod UID not available in MutatingWebhook

internal/controller/tensorfusioncluster_controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ func (r *TensorFusionClusterReconciler) listOwnedGPUPools(ctx context.Context, t
197197
return gpupoolsList.Items, nil
198198
}
199199

200-
func (r *TensorFusionClusterReconciler) reconcileTimeSeriesDatabase(ctx context.Context, tfc *tfv1.TensorFusionCluster) (bool, error) {
200+
func (r *TensorFusionClusterReconciler) reconcileTimeSeriesDatabase(_ context.Context, _ *tfv1.TensorFusionCluster) (bool, error) {
201201
// TODO: Not implemented yet
202202
return false, nil
203203
}

internal/controller/tensorfusionconnection_controller.go

Lines changed: 28 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -87,10 +87,8 @@ func (r *TensorFusionConnectionReconciler) Reconcile(ctx context.Context, req ct
8787
}
8888
}
8989

90-
needSelectWorker, shouldReturn, err := r.shouldSelectWorker(ctx, connection)
91-
if shouldReturn {
92-
// when err is not nil and shouldReturn is true,
93-
// it means already cleared the existing workerName and updated status, wait next reconcile loop
90+
needSelectWorker, err := r.shouldSelectWorker(ctx, connection)
91+
if err != nil {
9492
return ctrl.Result{}, err
9593
}
9694

@@ -99,6 +97,7 @@ func (r *TensorFusionConnectionReconciler) Reconcile(ctx context.Context, req ct
9997
return ctrl.Result{}, nil
10098
}
10199

100+
log.Info("Selecting worker for connection", "connection", connection.Name, "namespace", connection.Namespace)
102101
if workload.Spec.IsDynamicReplica() {
103102
// 1st MODE: select the dedicated worker if it's running, otherwise wait utils it's becoming ready
104103
return ctrl.Result{}, r.syncDedicatedWorkerStatus(ctx, connection)
@@ -109,8 +108,8 @@ func (r *TensorFusionConnectionReconciler) Reconcile(ctx context.Context, req ct
109108
}
110109

111110
func (r *TensorFusionConnectionReconciler) syncDedicatedWorkerStatus(ctx context.Context, connection *tfv1.TensorFusionConnection) error {
112-
var pod v1.Pod
113-
if err := r.Get(ctx, client.ObjectKey{Name: connection.Name, Namespace: connection.Namespace}, &pod); err != nil {
111+
pod := &v1.Pod{}
112+
if err := r.Get(ctx, client.ObjectKey{Name: connection.Name, Namespace: connection.Namespace}, pod); err != nil {
114113
return fmt.Errorf("failed to get dedicated worker pod for connection %w", err)
115114
}
116115
if pod.Status.Phase != v1.PodRunning {
@@ -124,14 +123,18 @@ func (r *TensorFusionConnectionReconciler) syncDedicatedWorkerStatus(ctx context
124123
if revision == "" {
125124
revision = "0"
126125
}
127-
connection.Status.ConnectionURL = fmt.Sprintf("native+%s+%d+%s-%s", pod.Status.PodIP, constants.TensorFusionWorkerPortNumber, pod.Name, revision)
126+
setConnectionWorkerURL(connection, pod.Status.PodIP, pod.Name, revision)
128127
if err := r.Status().Update(ctx, connection); err != nil {
129128
return fmt.Errorf("failed to update connection status: %w", err)
130129
}
131130
return nil
132131
}
133132
}
134133

134+
func setConnectionWorkerURL(connection *tfv1.TensorFusionConnection, podIp string, podName string, revision string) {
135+
connection.Status.ConnectionURL = fmt.Sprintf("native+%s+%d+%s-%s", podIp, constants.TensorFusionWorkerPortNumber, podName, revision)
136+
}
137+
135138
func (r *TensorFusionConnectionReconciler) selectWorkerAndSyncStatusFromWorkerPool(
136139
ctx context.Context,
137140
connection *tfv1.TensorFusionConnection,
@@ -162,8 +165,7 @@ func (r *TensorFusionConnectionReconciler) selectWorkerAndSyncStatusFromWorkerPo
162165
if resourceVersion == "" {
163166
resourceVersion = "0"
164167
}
165-
166-
connection.Status.ConnectionURL = fmt.Sprintf("native+%s+%d+%s-%s", s.WorkerIp, constants.TensorFusionWorkerPortNumber, s.WorkerName, resourceVersion)
168+
setConnectionWorkerURL(connection, s.WorkerIp, s.WorkerName, resourceVersion)
167169
if err := r.Status().Update(ctx, connection); err != nil {
168170
return ctrl.Result{}, fmt.Errorf("update connection status: %w", err)
169171
}
@@ -202,38 +204,48 @@ func (r *TensorFusionConnectionReconciler) patchMatchedWorkerLabel(ctx context.C
202204

203205
func (r *TensorFusionConnectionReconciler) shouldSelectWorker(
204206
ctx context.Context, connection *tfv1.TensorFusionConnection,
205-
) (bool, bool, error) {
206-
needSelectWorker := false
207+
) (needSelectWorker bool, err error) {
207208
if connection.Status.WorkerName != "" {
208209
// check if worker pod is still running
209210
pod := &v1.Pod{}
210211
if err := r.Get(ctx, client.ObjectKey{Name: connection.Status.WorkerName, Namespace: connection.Namespace}, pod); err != nil {
211212
if errors.IsNotFound(err) {
212213
needSelectWorker = true
213214
} else {
214-
return false, true, fmt.Errorf("failed to get worker pod: %w", err)
215+
return needSelectWorker, fmt.Errorf("failed to get worker pod: %w", err)
215216
}
216217
}
218+
// NOTE: no need to handle pod deleting since connection should be deleted at first, sync running status with Pod
217219
if pod.Status.Phase != v1.PodRunning {
218220
connection.Status.WorkerName = ""
219221
connection.Status.Phase = tfv1.WorkerFailed
220222
connection.Status.ConnectionURL = ""
221223
// set worker name to empty to trigger select worker again
222224
if updateErr := r.Status().Update(ctx, connection); updateErr != nil {
223-
return false, true, fmt.Errorf("failed to update connection status: %w", updateErr)
225+
return false, fmt.Errorf("failed to update connection status: %w", updateErr)
226+
}
227+
// let next reconcile loop to trigger select worker
228+
return false, nil
229+
} else if connection.Status.Phase != tfv1.WorkerRunning {
230+
// pod is running now, but connection is not running, update connection to running
231+
connection.Status.Phase = tfv1.WorkerRunning
232+
setConnectionWorkerURL(connection, pod.Status.PodIP, pod.Name, pod.ResourceVersion)
233+
if updateErr := r.Status().Update(ctx, connection); updateErr != nil {
234+
return false, fmt.Errorf("failed to update connection status: %w", updateErr)
224235
}
225-
return false, true, nil
236+
// current worker is working again, no need to select another worker
237+
return false, nil
226238
}
227239
} else {
228240
if connection.Status.Phase == "" {
229241
connection.Status.Phase = tfv1.WorkerPending
230242
if updateErr := r.Status().Update(ctx, connection); updateErr != nil {
231-
return false, true, fmt.Errorf("failed to update connection status: %w", updateErr)
243+
return false, fmt.Errorf("failed to update connection status: %w", updateErr)
232244
}
233245
}
234246
needSelectWorker = true
235247
}
236-
return needSelectWorker, false, nil
248+
return needSelectWorker, nil
237249
}
238250

239251
// SetupWithManager sets up the controller with the Manager.

0 commit comments

Comments (0)