|
| 1 | +package v1 |
| 2 | + |
| 3 | +import ( |
| 4 | + "context" |
| 5 | + "fmt" |
| 6 | + "strconv" |
| 7 | + |
| 8 | + "github.com/NexusGPU/tensor-fusion/internal/constants" |
| 9 | + "github.com/NexusGPU/tensor-fusion/internal/utils" |
| 10 | + corev1 "k8s.io/api/core/v1" |
| 11 | + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" |
| 12 | + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" |
| 13 | + "sigs.k8s.io/controller-runtime/pkg/client" |
| 14 | +) |
| 15 | + |
| 16 | +type TensorFusionPodCounter struct { |
| 17 | + Client client.Client |
| 18 | +} |
| 19 | + |
| 20 | +// getOrGenerateKey returns the pod's counter key from annotation if present, otherwise generates one from pod template labels (e.g. pod-template-hash or fallback to object hash) |
| 21 | +func getOrGenerateKey(pod *corev1.Pod) string { |
| 22 | + if pod.Annotations != nil { |
| 23 | + if key, ok := pod.Annotations[constants.TensorFusionPodCounterKeyAnnotation]; ok && key != "" { |
| 24 | + return key |
| 25 | + } |
| 26 | + } |
| 27 | + // Try to use pod-template-hash if present |
| 28 | + if hash, ok := pod.Labels["pod-template-hash"]; ok && hash != "" { |
| 29 | + return hash |
| 30 | + } |
| 31 | + |
| 32 | + // Fallback to object hash |
| 33 | + return utils.GetObjectHash(pod) |
| 34 | +} |
| 35 | + |
| 36 | +// Get gets the counter value from the owner annotation by key |
| 37 | +func (c *TensorFusionPodCounter) Get(ctx context.Context, pod *corev1.Pod) (int32, string, error) { |
| 38 | + ownerRef := getControllerOwnerRef(pod) |
| 39 | + if ownerRef == nil { |
| 40 | + return 0, "", fmt.Errorf("no controller owner reference found for pod %s/%s", pod.Namespace, pod.Name) |
| 41 | + } |
| 42 | + key := getOrGenerateKey(pod) |
| 43 | + ownerObj := &unstructured.Unstructured{} |
| 44 | + ownerObj.SetAPIVersion(ownerRef.APIVersion) |
| 45 | + ownerObj.SetKind(ownerRef.Kind) |
| 46 | + objKey := client.ObjectKey{Name: ownerRef.Name, Namespace: pod.Namespace} |
| 47 | + if err := c.Client.Get(ctx, objKey, ownerObj); err != nil { |
| 48 | + return 0, "", fmt.Errorf("failed to get owner object: %w", err) |
| 49 | + } |
| 50 | + annotations := ownerObj.GetAnnotations() |
| 51 | + if annotations == nil { |
| 52 | + return 0, "", nil |
| 53 | + } |
| 54 | + val, ok := annotations[key] |
| 55 | + if !ok || val == "" { |
| 56 | + return 0, "", nil |
| 57 | + } |
| 58 | + count, err := strconv.ParseInt(val, 10, 32) |
| 59 | + if err != nil { |
| 60 | + return 0, "", fmt.Errorf("invalid count annotation: %s, err: %w", val, err) |
| 61 | + } |
| 62 | + return int32(count), key, nil |
| 63 | +} |
| 64 | + |
| 65 | +// Increase increases the counter in owner annotation by key |
| 66 | +func (c *TensorFusionPodCounter) Increase(ctx context.Context, pod *corev1.Pod) error { |
| 67 | + ownerRef := getControllerOwnerRef(pod) |
| 68 | + if ownerRef == nil { |
| 69 | + return fmt.Errorf("no controller owner reference found for pod %s/%s", pod.Namespace, pod.Name) |
| 70 | + } |
| 71 | + key := getOrGenerateKey(pod) |
| 72 | + ownerObj := &unstructured.Unstructured{} |
| 73 | + ownerObj.SetAPIVersion(ownerRef.APIVersion) |
| 74 | + ownerObj.SetKind(ownerRef.Kind) |
| 75 | + objKey := client.ObjectKey{Name: ownerRef.Name, Namespace: pod.Namespace} |
| 76 | + if err := c.Client.Get(ctx, objKey, ownerObj); err != nil { |
| 77 | + return fmt.Errorf("failed to get owner object: %w", err) |
| 78 | + } |
| 79 | + annotations := ownerObj.GetAnnotations() |
| 80 | + if annotations == nil { |
| 81 | + annotations = map[string]string{} |
| 82 | + } |
| 83 | + val := annotations[key] |
| 84 | + if val == "" { |
| 85 | + val = "0" |
| 86 | + } |
| 87 | + count, err := strconv.ParseInt(val, 10, 32) |
| 88 | + if err != nil { |
| 89 | + return fmt.Errorf("invalid count annotation: %s, err: %w", val, err) |
| 90 | + } |
| 91 | + count++ |
| 92 | + annotations[key] = fmt.Sprintf("%d", count) |
| 93 | + ownerObj.SetAnnotations(annotations) |
| 94 | + if err := c.Client.Update(ctx, ownerObj); err != nil { |
| 95 | + return fmt.Errorf("failed to update owner annotation: %w", err) |
| 96 | + } |
| 97 | + return nil |
| 98 | +} |
| 99 | + |
| 100 | +// Decrease decreases the counter in owner annotation by key |
| 101 | +func (c *TensorFusionPodCounter) Decrease(ctx context.Context, pod *corev1.Pod) error { |
| 102 | + ownerRef := getControllerOwnerRef(pod) |
| 103 | + if ownerRef == nil { |
| 104 | + return fmt.Errorf("no controller owner reference found for pod %s/%s", pod.Namespace, pod.Name) |
| 105 | + } |
| 106 | + key := getOrGenerateKey(pod) |
| 107 | + ownerObj := &unstructured.Unstructured{} |
| 108 | + ownerObj.SetAPIVersion(ownerRef.APIVersion) |
| 109 | + ownerObj.SetKind(ownerRef.Kind) |
| 110 | + objKey := client.ObjectKey{Name: ownerRef.Name, Namespace: pod.Namespace} |
| 111 | + if err := c.Client.Get(ctx, objKey, ownerObj); err != nil { |
| 112 | + return fmt.Errorf("failed to get owner object: %w", err) |
| 113 | + } |
| 114 | + annotations := ownerObj.GetAnnotations() |
| 115 | + if annotations == nil { |
| 116 | + annotations = map[string]string{} |
| 117 | + } |
| 118 | + val := annotations[key] |
| 119 | + if val == "" { |
| 120 | + val = "0" |
| 121 | + } |
| 122 | + count, err := strconv.ParseInt(val, 10, 32) |
| 123 | + if err != nil { |
| 124 | + return fmt.Errorf("invalid count annotation: %s, err: %w", val, err) |
| 125 | + } |
| 126 | + count-- |
| 127 | + if count <= 0 { |
| 128 | + delete(annotations, key) |
| 129 | + } else { |
| 130 | + annotations[key] = fmt.Sprintf("%d", count) |
| 131 | + } |
| 132 | + ownerObj.SetAnnotations(annotations) |
| 133 | + if err := c.Client.Update(ctx, ownerObj); err != nil { |
| 134 | + return fmt.Errorf("failed to update owner annotation: %w", err) |
| 135 | + } |
| 136 | + return nil |
| 137 | +} |
| 138 | + |
| 139 | +// getControllerOwnerRef returns the controller owner reference of a pod |
| 140 | +func getControllerOwnerRef(pod *corev1.Pod) *metav1.OwnerReference { |
| 141 | + for i, ref := range pod.OwnerReferences { |
| 142 | + if ref.Controller != nil && *ref.Controller { |
| 143 | + return &pod.OwnerReferences[i] |
| 144 | + } |
| 145 | + } |
| 146 | + return nil |
| 147 | +} |
0 commit comments