fix: token review permission issue for remote worker, pass annotation to workload, fix allocator mem state bug (#303)

Code2Life · web-flow · commit d4a82190457b · 2025-07-31T09:20:29.000+08:00
* fix: token review permission issue for remote worker, pass annotation to workload, fix allocator mem state bug

* fix: bump helm version
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -150,6 +150,7 @@
         "tflops",
         "timberio",
         "Tmpl",
+        "tokenreviews",
         "Tolerations",
         "utilerrors",
         "utilruntime",
diff --git a/charts/tensor-fusion/Chart.yaml b/charts/tensor-fusion/Chart.yaml
@@ -15,7 +15,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 1.5.2
+version: 1.5.3
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
diff --git a/charts/tensor-fusion/templates/rbac-hypervisor.yaml b/charts/tensor-fusion/templates/rbac-hypervisor.yaml
@@ -26,6 +26,12 @@ rules:
   - watch
   - update
   - patch
+- apiGroups:
+  - authentication.k8s.io
+  resources:
+  - tokenreviews
+  verbs:
+  - create
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding
diff --git a/charts/tensor-fusion/templates/rbac.yaml b/charts/tensor-fusion/templates/rbac.yaml
@@ -182,7 +182,12 @@ rules:
   - get
   - list
   - watch
-
+- apiGroups:
+    - authentication.k8s.io
+  resources:
+    - tokenreviews
+  verbs:
+    - create
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding
diff --git a/cmd/nodediscovery/main.go b/cmd/nodediscovery/main.go
@@ -161,8 +161,11 @@ func main() {
 			ctrl.Log.Info("found GPU info from config", "deviceName", deviceName, "FP16 TFlops", tflops, "uuid", uuid)
 		}
 
-		gpu := createOrUpdateTensorFusionGPU(k8sClient, ctx, k8sNodeName, gpunode, uuid, deviceName, memInfo, tflops)
-
+		gpu, err := createOrUpdateTensorFusionGPU(k8sClient, ctx, k8sNodeName, gpunode, uuid, deviceName, memInfo, tflops)
+		if err != nil {
+			ctrl.Log.Error(err, "failed to create or update GPU", "uuid", uuid)
+			os.Exit(1)
+		}
 		totalTFlops.Add(gpu.Status.Capacity.Tflops)
 		totalVRAM.Add(gpu.Status.Capacity.Vram)
 		availableTFlops.Add(gpu.Status.Available.Tflops)
@@ -194,13 +197,17 @@ func patchGPUNodeStatus(k8sClient client.Client, ctx context.Context,
 
 func createOrUpdateTensorFusionGPU(
 	k8sClient client.Client, ctx context.Context, k8sNodeName string, gpunode *tfv1.GPUNode,
-	uuid string, deviceName string, memInfo nvml.Memory_v2, tflops resource.Quantity) *tfv1.GPU {
+	uuid string, deviceName string, memInfo nvml.Memory_v2, tflops resource.Quantity) (*tfv1.GPU, error) {
 	gpu := &tfv1.GPU{
 		ObjectMeta: metav1.ObjectMeta{
 			Name: uuid,
 		},
 	}
 
+	if len(gpunode.OwnerReferences) == 0 {
+		return nil, fmt.Errorf("GPUNode has no owner references of GPU pool")
+	}
+
 	err := retry.OnError(wait.Backoff{
 		Steps:    10,
 		Duration: time.Second,
@@ -213,6 +220,7 @@ func createOrUpdateTensorFusionGPU(
 			// Set metadata fields
 			gpu.Labels = map[string]string{
 				constants.LabelKeyOwner: gpunode.Name,
+				constants.GpuPoolKey:    gpunode.OwnerReferences[0].Name,
 			}
 			gpu.Annotations = map[string]string{
 				constants.LastSyncTimeAnnotationKey: time.Now().Format(time.RFC3339),
@@ -240,7 +248,7 @@ func createOrUpdateTensorFusionGPU(
 	})
 	if err != nil {
 		ctrl.Log.Error(err, "failed to create or update GPU after retries", "gpu", gpu)
-		os.Exit(1)
+		return nil, err
 	}
 
 	err = retry.OnError(retry.DefaultBackoff, func(err error) bool {
@@ -272,10 +280,10 @@ func createOrUpdateTensorFusionGPU(
 	})
 	if err != nil {
 		ctrl.Log.Error(err, "failed to update status of GPU after retries", "gpu", gpu)
-		os.Exit(1)
+		return nil, err
 	}
 
-	return gpu
+	return gpu, nil
 }
 
 func kubeClient() (client.Client, error) {
diff --git a/cmd/nodediscovery/main_test.go b/cmd/nodediscovery/main_test.go
@@ -30,6 +30,11 @@ func TestCreateOrUpdateTensorFusionGPU(t *testing.T) {
 	gpuNode := &tfv1.GPUNode{
 		ObjectMeta: metav1.ObjectMeta{
 			Name: gpuNodeName,
+			OwnerReferences: []metav1.OwnerReference{
+				{
+					Name: "test-gpu-pool",
+				},
+			},
 		},
 	}
 
@@ -38,7 +43,8 @@ func TestCreateOrUpdateTensorFusionGPU(t *testing.T) {
 
 	k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithStatusSubresource(&tfv1.GPU{}).Build()
 
-	gpu := createOrUpdateTensorFusionGPU(k8sClient, ctx, k8sNodeName, gpuNode, uuid, deviceName, memInfo, tflops)
+	gpu, err := createOrUpdateTensorFusionGPU(k8sClient, ctx, k8sNodeName, gpuNode, uuid, deviceName, memInfo, tflops)
+	assert.NoError(t, err)
 
 	// Assertions
 	assert.NotNil(t, gpu, "GPU object should not be nil")
@@ -51,10 +57,13 @@ func TestCreateOrUpdateTensorFusionGPU(t *testing.T) {
 		gpu.Status.NodeSelector, "Node selector should match")
 
 	// Verify labels and annotations
-	assert.Equal(t, map[string]string{constants.LabelKeyOwner: gpuNodeName}, gpu.Labels, "GPU labels should match")
+	assert.Equal(t, map[string]string{
+		constants.LabelKeyOwner: gpuNodeName,
+		constants.GpuPoolKey:    "test-gpu-pool",
+	}, gpu.Labels, "GPU labels should match")
 	assert.Contains(t, gpu.Annotations, constants.LastSyncTimeAnnotationKey,
 		"GPU annotations should contain last report time")
-	_, err := time.Parse(time.RFC3339, gpu.Annotations[constants.LastSyncTimeAnnotationKey])
+	_, err = time.Parse(time.RFC3339, gpu.Annotations[constants.LastSyncTimeAnnotationKey])
 	assert.NoError(t, err, "Last report time annotation should be a valid RFC3339 timestamp")
 
 	// Verify the Available field does not change after the update
@@ -64,7 +73,8 @@ func TestCreateOrUpdateTensorFusionGPU(t *testing.T) {
 	assert.NoError(t, err)
 
 	tflops.Add(resource.MustParse("100"))
-	updatedGpu := createOrUpdateTensorFusionGPU(k8sClient, ctx, k8sNodeName, gpuNode, uuid, deviceName, memInfo, tflops)
+	updatedGpu, err := createOrUpdateTensorFusionGPU(k8sClient, ctx, k8sNodeName, gpuNode, uuid, deviceName, memInfo, tflops)
+	assert.NoError(t, err)
 	assert.NotEqual(t, updatedGpu.Status.Capacity, gpu.Status.Capacity, "GPU capacity should not match")
 	assert.Equal(t, updatedGpu.Status.Available.Tflops, gpu.Status.Available.Tflops, "GPU TFlops should match")
 	assert.Equal(t, updatedGpu.Status.Available.Vram, gpu.Status.Available.Vram, "GPU VRAM should match")
@@ -84,6 +94,11 @@ func TestGPUControllerReference(t *testing.T) {
 		ObjectMeta: metav1.ObjectMeta{
 			Name: gpuNodeName,
 			UID:  "mock-uid",
+			OwnerReferences: []metav1.OwnerReference{
+				{
+					Name: "test-gpu-pool",
+				},
+			},
 		},
 	}
 
@@ -92,17 +107,24 @@ func TestGPUControllerReference(t *testing.T) {
 
 	k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithStatusSubresource(&tfv1.GPU{}).Build()
 
-	gpu := createOrUpdateTensorFusionGPU(k8sClient, ctx, k8sNodeName, gpuNode, uuid, deviceName, memInfo, tflops)
+	gpu, err := createOrUpdateTensorFusionGPU(k8sClient, ctx, k8sNodeName, gpuNode, uuid, deviceName, memInfo, tflops)
+	assert.NoError(t, err)
 	assert.True(t, metav1.IsControlledBy(gpu, gpuNode))
 
 	newGpuNode := &tfv1.GPUNode{
 		ObjectMeta: metav1.ObjectMeta{
 			Name: "new-test-gpu-node",
 			UID:  "new-mock-uid",
+			OwnerReferences: []metav1.OwnerReference{
+				{
+					Name: "new-test-gpu-pool",
+				},
+			},
 		},
 	}
 
-	gpu = createOrUpdateTensorFusionGPU(k8sClient, ctx, k8sNodeName, newGpuNode, uuid, deviceName, memInfo, tflops)
+	gpu, err = createOrUpdateTensorFusionGPU(k8sClient, ctx, k8sNodeName, newGpuNode, uuid, deviceName, memInfo, tflops)
+	assert.NoError(t, err)
 	assert.NotNil(t, gpu.OwnerReferences[0].Kind)
 	assert.NotNil(t, gpu.OwnerReferences[0].APIVersion)
 	assert.True(t, metav1.IsControlledBy(gpu, newGpuNode))
@@ -127,6 +149,11 @@ func TestPatchGPUNodeStatus(t *testing.T) {
 					ObjectMeta: metav1.ObjectMeta{
 						Name:      "test-gpu-node",
 						Namespace: "default",
+						OwnerReferences: []metav1.OwnerReference{
+							{
+								Name: "test-gpu-pool",
+							},
+						},
 					},
 					Status: tfv1.GPUNodeStatus{
 						Phase:       "", // Empty phase should be set to pending
@@ -161,6 +188,11 @@ func TestPatchGPUNodeStatus(t *testing.T) {
 					ObjectMeta: metav1.ObjectMeta{
 						Name:      "test-gpu-node-running",
 						Namespace: "default",
+						OwnerReferences: []metav1.OwnerReference{
+							{
+								Name: "test-gpu-pool",
+							},
+						},
 					},
 					Status: tfv1.GPUNodeStatus{
 						Phase:       tfv1.TensorFusionGPUNodePhaseRunning,
@@ -194,6 +226,11 @@ func TestPatchGPUNodeStatus(t *testing.T) {
 					ObjectMeta: metav1.ObjectMeta{
 						Name:      "test-gpu-node-zero",
 						Namespace: "default",
+						OwnerReferences: []metav1.OwnerReference{
+							{
+								Name: "test-gpu-pool",
+							},
+						},
 					},
 					Status: tfv1.GPUNodeStatus{
 						Phase: "",
@@ -279,6 +316,11 @@ func TestPatchGPUNodeStatus_ErrorScenarios(t *testing.T) {
 					ObjectMeta: metav1.ObjectMeta{
 						Name:      "nonexistent-gpu-node",
 						Namespace: "default",
+						OwnerReferences: []metav1.OwnerReference{
+							{
+								Name: "test-gpu-pool",
+							},
+						},
 					},
 				}
 			},
@@ -315,6 +357,11 @@ func TestPatchGPUNodeStatus_Integration(t *testing.T) {
 		ObjectMeta: metav1.ObjectMeta{
 			Name:      "integration-test-node",
 			Namespace: "default",
+			OwnerReferences: []metav1.OwnerReference{
+				{
+					Name: "test-gpu-pool",
+				},
+			},
 		},
 		Status: tfv1.GPUNodeStatus{
 			Phase:               "",
diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml
@@ -76,6 +76,12 @@ rules:
   - patch
   - update
   - watch
+- apiGroups:
+  - authentication.k8s.io
+  resources:
+  - tokenreviews
+  verbs:
+  - create
 - apiGroups:
   - batch
   resources:
diff --git a/internal/constants/constants.go b/internal/constants/constants.go
@@ -71,12 +71,18 @@ const (
 	SetPendingOwnedWorkloadAnnotation = Domain + "/pending-owned-workload"
 	PricingAnnotation                 = Domain + "/hourly-pricing"
 
+	WorkloadModeAnnotation = Domain + "/workload-mode"
+	WorkloadModeDynamic    = "dynamic"
+	WorkloadModeFixed      = "fixed"
+
 	// Annotations for killer switch: disable features
 	// ['gpu-opt', 'mem-manager', 'gpu-limiter']
 	DisableFeaturesAnnotation = Domain + "/disable-features"
 	BuiltInFeaturesGpuOpt     = "gpu-opt"
 	BuiltInFeaturesGpuLimiter = "gpu-limiter"
 	BuiltInFeaturesMemManager = "mem-manager"
+	// Debug only (Remote vGPU): disable auto-starting the worker so it can be started manually inside the Pod
+	BuiltInFeatureStartWorker = "start-worker"
 
 	GenHostPortLabel        = Domain + "/host-port"
 	GenHostPortLabelValue   = "auto"
diff --git a/internal/controller/gpu_controller.go b/internal/controller/gpu_controller.go
@@ -19,13 +19,10 @@ package controller
 import (
 	"context"
 	"fmt"
-	"strings"
 
 	tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
 	"github.com/NexusGPU/tensor-fusion/internal/constants"
-	"github.com/samber/lo"
 	"k8s.io/apimachinery/pkg/api/errors"
-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
@@ -52,42 +49,12 @@ func (r *GPUReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R
 		return ctrl.Result{}, err
 	}
 
-	kgvs, _, err := r.Scheme.ObjectKinds(&tfv1.GPUNode{})
-	if err != nil {
-		return ctrl.Result{}, fmt.Errorf("get object kinds for GPUNode: %w", err)
-	}
-
-	owner, ok := lo.Find(gpu.OwnerReferences, func(or metav1.OwnerReference) bool {
-		for _, kvg := range kgvs {
-			if kvg.Kind == or.Kind && fmt.Sprintf("%s/%s", kvg.Group, kvg.Version) == or.APIVersion {
-				return true
-			}
-		}
-		return false
-	})
-
-	if !ok {
-		return ctrl.Result{}, fmt.Errorf("owner node of gpu(%s) not found", gpu.Name)
-	}
-
 	gpunode := &tfv1.GPUNode{}
-	if err := r.Get(ctx, client.ObjectKey{Name: owner.Name}, gpunode); err != nil {
-		return ctrl.Result{}, fmt.Errorf("get node %s: %w", owner.Name, err)
-	}
-
-	var poolName string
-	for labelKey := range gpunode.Labels {
-		after, ok := strings.CutPrefix(labelKey, constants.GPUNodePoolIdentifierLabelPrefix)
-		if ok {
-			poolName = after
-			break
-		}
-	}
-
-	if poolName == "" {
-		return ctrl.Result{}, fmt.Errorf("node %s is not assigned to any pool", gpunode.Name)
+	if err := r.Get(ctx, client.ObjectKey{Name: gpu.Labels[constants.LabelKeyOwner]}, gpunode); err != nil {
+		return ctrl.Result{}, fmt.Errorf("can not get node %s: %w", gpu.Labels[constants.LabelKeyOwner], err)
 	}
 
+	// Backfill UsedBy for GPUs created by an older discovery job that did not set the field
 	if gpu.Status.UsedBy == "" && gpu.Status.UUID != "" {
 		patch := client.MergeFrom(gpu.DeepCopy())
 		gpu.Status.UsedBy = tfv1.UsedByTensorFusion
@@ -97,20 +64,6 @@ func (r *GPUReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R
 		return ctrl.Result{}, nil
 	}
 
-	// No need to calculate patch since GPU's owner pool not changed
-	if gpu.Labels != nil && gpu.Labels[constants.GpuPoolKey] == poolName {
-		return ctrl.Result{}, nil
-	}
-
-	patch := client.MergeFrom(gpu.DeepCopy())
-	if gpu.Labels == nil {
-		gpu.Labels = make(map[string]string)
-	}
-	gpu.Labels[constants.GpuPoolKey] = poolName
-	if err := r.Patch(ctx, gpu, patch); err != nil {
-		return ctrl.Result{}, fmt.Errorf("patch gpu %s: %w", gpu.Name, err)
-	}
-
 	return ctrl.Result{}, nil
 }
 
diff --git a/internal/controller/tensorfusionconnection_controller.go b/internal/controller/tensorfusionconnection_controller.go
@@ -54,6 +54,7 @@ type TensorFusionConnectionReconciler struct {
 // +kubebuilder:rbac:groups=tensor-fusion.ai,resources=tensorfusionconnections,verbs=get;list;watch;create;update;patch;delete
 // +kubebuilder:rbac:groups=tensor-fusion.ai,resources=tensorfusionconnections/status,verbs=get;update;patch
 // +kubebuilder:rbac:groups=tensor-fusion.ai,resources=tensorfusionconnections/finalizers,verbs=update
+// +kubebuilder:rbac:groups=authentication.k8s.io,resources=tokenreviews,verbs=create
 
 // Add and monitor GPU worker Pod for a TensorFusionConnection
 func (r *TensorFusionConnectionReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
@@ -70,14 +71,14 @@ func (r *TensorFusionConnectionReconciler) Reconcile(ctx context.Context, req ct
 			// Object not found, could have been deleted after reconcile request, return without error
 			return ctrl.Result{}, nil
 		}
-		log.Error(err, "Failed to get TensorFusionConnection")
+		log.Error(err, "Failed to get TensorFusionConnection", "name", req.Name)
 		return ctrl.Result{}, err
 	}
 
 	workloadName := connection.Labels[constants.WorkloadKey]
 	workload := &tfv1.TensorFusionWorkload{}
 	if err := r.Get(ctx, client.ObjectKey{Name: workloadName, Namespace: connection.Namespace}, workload); err != nil {
-		return ctrl.Result{}, fmt.Errorf("can not found TensorFusionWorkload: %w", err)
+		return ctrl.Result{}, fmt.Errorf("can not found TensorFusionWorkload for connection %s: %w", connection.Name, err)
 	}
 
 	if workload.Spec.IsDynamicReplica() {
diff --git a/internal/gpuallocator/gpuallocator.go b/internal/gpuallocator/gpuallocator.go
diff --git a/internal/utils/compose.go b/internal/utils/compose.go
diff --git a/internal/webhook/v1/pod_webhook.go b/internal/webhook/v1/pod_webhook.go
diff --git a/internal/worker/worker.go b/internal/worker/worker.go