feat: add GPU metrics for TFLOPS and VRAM requests and limits

0x5457 · 0x5457 · commit bbd183582701 · 2025-01-17T06:08:13.000Z
diff --git a/cmd/main.go b/cmd/main.go
@@ -26,6 +26,14 @@ import (
 	// to ensure that exec-entrypoint and run can make use of them.
 	_ "k8s.io/client-go/plugin/pkg/client/auth"
 
+	tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
+	"github.com/NexusGPU/tensor-fusion-operator/internal/config"
+	"github.com/NexusGPU/tensor-fusion-operator/internal/controller"
+	"github.com/NexusGPU/tensor-fusion-operator/internal/scheduler"
+	"github.com/NexusGPU/tensor-fusion-operator/internal/server"
+	"github.com/NexusGPU/tensor-fusion-operator/internal/server/router"
+	webhookcorev1 "github.com/NexusGPU/tensor-fusion-operator/internal/webhook/v1"
+	"github.com/NexusGPU/tensor-fusion-operator/internal/worker"
 	"k8s.io/apimachinery/pkg/runtime"
 	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
 	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
@@ -36,15 +44,6 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/metrics/filters"
 	metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
 	"sigs.k8s.io/controller-runtime/pkg/webhook"
-
-	tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
-	"github.com/NexusGPU/tensor-fusion-operator/internal/config"
-	"github.com/NexusGPU/tensor-fusion-operator/internal/controller"
-	"github.com/NexusGPU/tensor-fusion-operator/internal/scheduler"
-	"github.com/NexusGPU/tensor-fusion-operator/internal/server"
-	"github.com/NexusGPU/tensor-fusion-operator/internal/server/router"
-	webhookcorev1 "github.com/NexusGPU/tensor-fusion-operator/internal/webhook/v1"
-	"github.com/NexusGPU/tensor-fusion-operator/internal/worker"
 	// +kubebuilder:scaffold:imports
 )
 
@@ -76,7 +75,7 @@ func main() {
 	flag.BoolVar(&enableLeaderElection, "leader-elect", false,
 		"Enable leader election for controller manager. "+
 			"Enabling this will ensure there is only one active controller manager.")
-	flag.BoolVar(&secureMetrics, "metrics-secure", true,
+	flag.BoolVar(&secureMetrics, "metrics-secure", false,
 		"If set, the metrics endpoint is served securely via HTTPS. Use --metrics-secure=false to use HTTP instead.")
 	flag.BoolVar(&enableHTTP2, "enable-http2", false,
 		"If set, HTTP/2 will be enabled for the metrics and webhook servers")
diff --git a/go.mod b/go.mod
@@ -8,6 +8,7 @@ require (
 	github.com/lithammer/shortuuid/v4 v4.2.0
 	github.com/onsi/ginkgo/v2 v2.22.1
 	github.com/onsi/gomega v1.36.2
+	github.com/prometheus/client_golang v1.20.5
 	github.com/samber/lo v1.47.0
 	golang.org/x/exp v0.0.0-20241217172543-b2144cdd0a67
 	gomodules.xyz/jsonpatch/v2 v2.4.0
@@ -72,7 +73,6 @@ require (
 	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
 	github.com/pelletier/go-toml/v2 v2.2.3 // indirect
 	github.com/pkg/errors v0.9.1 // indirect
-	github.com/prometheus/client_golang v1.20.5 // indirect
 	github.com/prometheus/client_model v0.6.1 // indirect
 	github.com/prometheus/common v0.61.0 // indirect
 	github.com/prometheus/procfs v0.15.1 // indirect
diff --git a/internal/constants/constants.go b/internal/constants/constants.go
@@ -11,9 +11,11 @@ const (
 	Finalizer       = Domain + "/" + FinalizerSuffix
 
 	// Annotation key constants
-	EnableContainerAnnotationFormat = Domain + "/enable-%s"
-	TFLOPSContainerAnnotationFormat = Domain + "/tflops-%s"
-	VRAMContainerAnnotationFormat   = Domain + "/vram-%s"
+	EnableAnnotationFormat        = Domain + "/enable-%s"
+	TFLOPSRequestAnnotationFormat = Domain + "/tflops-request-%s"
+	VRAMRequestAnnotationFormat   = Domain + "/vram-request-%s"
+	TFLOPSLimitAnnotationFormat   = Domain + "/tflops-limit-%s"
+	VRAMLimitAnnotationFormat     = Domain + "/vram-limit-%s"
 
 	PendingRequeueDuration = time.Second * 3
 
diff --git a/internal/controller/pod_controller.go b/internal/controller/pod_controller.go
@@ -22,7 +22,9 @@ import (
 
 	tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
 	"github.com/NexusGPU/tensor-fusion-operator/internal/constants"
+	"github.com/NexusGPU/tensor-fusion-operator/internal/metrics"
 	webhookv1 "github.com/NexusGPU/tensor-fusion-operator/internal/webhook/v1"
+	"github.com/prometheus/client_golang/prometheus"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -55,13 +57,13 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R
 		log.Error(err, "Failed to get Pod")
 		return ctrl.Result{}, err
 	}
-	reqs := webhookv1.ParseTFReq(pod)
-	if len(reqs) == 0 {
+	resources := webhookv1.ParseTFResources(pod)
+	if len(resources) == 0 {
 		return ctrl.Result{}, nil
 	}
 
 	// generate tensor fusion connections and apply to cluster
-	tfConnections := GenerateTensorFusionConnection(pod, reqs)
+	tfConnections := GenerateTensorFusionConnection(pod, resources)
 
 	for _, tfConnection := range tfConnections {
 		existConn := &tfv1.TensorFusionConnection{}
@@ -73,10 +75,24 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R
 			}
 		}
 	}
+
+	// update metrics
+	for _, res := range resources {
+		labels := prometheus.Labels{
+			"pod":       pod.Name,
+			"namespace": pod.Namespace,
+			"container": res.ContainerName,
+		}
+		metrics.GpuTflopsRequest.With(labels).Set(res.TflopsRequest.AsApproximateFloat64())
+		metrics.GpuTflopsLimit.With(labels).Set(res.TflopsLimit.AsApproximateFloat64())
+		metrics.VramBytesRequest.With(labels).Set(res.VramRequest.AsApproximateFloat64())
+		metrics.VramBytesLimit.With(labels).Set(res.VramLimit.AsApproximateFloat64())
+	}
+
 	return ctrl.Result{}, nil
 }
 
-func GenerateTensorFusionConnection(pod *corev1.Pod, tfReq []webhookv1.TFReq) []*tfv1.TensorFusionConnection {
+func GenerateTensorFusionConnection(pod *corev1.Pod, tfReq []webhookv1.TFResource) []*tfv1.TensorFusionConnection {
 	connections := make([]*tfv1.TensorFusionConnection, 0, len(tfReq))
 
 	for _, req := range tfReq {
@@ -96,12 +112,12 @@ func GenerateTensorFusionConnection(pod *corev1.Pod, tfReq []webhookv1.TFReq) []
 			Spec: tfv1.TensorFusionConnectionSpec{
 				Resources: tfv1.Resources{
 					Requests: tfv1.Resource{
-						Tflops: req.Tflops,
-						Vram:   req.Vram,
+						Tflops: req.TflopsRequest,
+						Vram:   req.VramRequest,
 					},
 					Limits: tfv1.Resource{
-						Tflops: req.Tflops,
-						Vram:   req.Vram,
+						Tflops: req.TflopsLimit,
+						Vram:   req.VramLimit,
 					},
 				},
 			},
diff --git a/internal/metrics/connection.go b/internal/metrics/connection.go
@@ -0,0 +1,49 @@
+package metrics
+
+import (
+	"github.com/prometheus/client_golang/prometheus"
+	"sigs.k8s.io/controller-runtime/pkg/metrics"
+)
+
+var (
+	// GpuTflopsRequest is the per-container TFLOPS request gauge, labelled by namespace, pod and container.
+	GpuTflopsRequest = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: "gpu_tflops_request",
+			Help: "TFLOPS requested for a container.",
+		},
+		[]string{"namespace", "pod", "container"},
+	)
+
+	// GpuTflopsLimit is the per-container TFLOPS limit gauge, labelled by namespace, pod and container.
+	GpuTflopsLimit = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: "gpu_tflops_limit",
+			Help: "TFLOPS limit for a container.",
+		},
+		[]string{"namespace", "pod", "container"},
+	)
+
+	// VramBytesRequest is the per-container VRAM request gauge, labelled by namespace, pod and container.
+	VramBytesRequest = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: "vram_bytes_request",
+			Help: "VRAM requested for a container.",
+		},
+		[]string{"namespace", "pod", "container"},
+	)
+
+	// VramBytesLimit is the per-container VRAM limit gauge, labelled by namespace, pod and container.
+	VramBytesLimit = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: "vram_bytes_limit",
+			Help: "VRAM limit for a container.",
+		},
+		[]string{"namespace", "pod", "container"},
+	)
+)
+
+// init registers the gauges with the controller-runtime metrics registry; MustRegister panics if registration fails.
+func init() {
+	metrics.Registry.MustRegister(GpuTflopsRequest, GpuTflopsLimit, VramBytesRequest, VramBytesLimit)
+}
diff --git a/internal/webhook/v1/pod_webhook.go b/internal/webhook/v1/pod_webhook.go
@@ -69,13 +69,13 @@ func (m *TensorFusionPodMutator) Handle(ctx context.Context, req admission.Reque
 	log := log.FromContext(ctx)
 	log.Info("Mutating pod", "generateName", pod.GenerateName, "namespace", pod.Namespace)
 
-	reqs := ParseTFReq(pod)
-	if len(reqs) == 0 {
+	resources := ParseTFResources(pod)
+	if len(resources) == 0 {
 		return admission.Allowed("no tensor fusion requirements found")
 	}
 
 	// 1. Inject initContainer and env variables
-	patches, err := m.patchTFClient(pod, reqs)
+	patches, err := m.patchTFClient(pod, resources)
 	if err != nil {
 		return admission.Errored(http.StatusInternalServerError, err)
 	}
@@ -89,36 +89,43 @@ func (m *TensorFusionPodMutator) InjectDecoder(d admission.Decoder) error {
 	return nil
 }
 
-type TFReq struct {
+type TFResource struct {
 	ContainerName       string
 	ConnectionName      string
 	ConnectionNamespace string
-	Tflops              resource.Quantity
-	Vram                resource.Quantity
+	TflopsRequest       resource.Quantity
+	VramRequest         resource.Quantity
+	TflopsLimit         resource.Quantity
+	VramLimit           resource.Quantity
 }
 
-func ParseTFReq(pod *corev1.Pod) []TFReq {
+func ParseTFResources(pod *corev1.Pod) []TFResource {
 	if pod.Annotations == nil {
 		return nil
 	}
 
-	reqs := make([]TFReq, 0, len(pod.Spec.Containers))
+	reqs := make([]TFResource, 0, len(pod.Spec.Containers))
 
 	for _, container := range pod.Spec.Containers {
 		containerName := container.Name
 
 		// Check if TF requirements exist for this container
-		tflopsKey := fmt.Sprintf(constants.TFLOPSContainerAnnotationFormat, containerName)
-		vramKey := fmt.Sprintf(constants.VRAMContainerAnnotationFormat, containerName)
+		tflopsReqKey := fmt.Sprintf(constants.TFLOPSRequestAnnotationFormat, containerName)
+		vramReqKey := fmt.Sprintf(constants.VRAMRequestAnnotationFormat, containerName)
+		tflopsLimitKey := fmt.Sprintf(constants.TFLOPSLimitAnnotationFormat, containerName)
+		vramLimitKey := fmt.Sprintf(constants.VRAMLimitAnnotationFormat, containerName)
 
-		tflopsStr, hasTflops := pod.Annotations[tflopsKey]
-		vramStr, hasVram := pod.Annotations[vramKey]
+		tflopsReqStr, hasTflopsReq := pod.Annotations[tflopsReqKey]
+		vramReqStr, hasVramReq := pod.Annotations[vramReqKey]
 
-		if !hasTflops && !hasVram {
+		tflopsLimitStr, hasTflopsLimit := pod.Annotations[tflopsLimitKey]
+		vramLimitStr, hasVramLimit := pod.Annotations[vramLimitKey]
+
+		if !hasTflopsReq && !hasVramReq && !hasTflopsLimit && !hasVramLimit {
 			continue
 		}
 
-		req := TFReq{
+		req := TFResource{
 			ContainerName: containerName,
 		}
 		connectionNameEnv, ok := lo.Find(container.Env, func(e corev1.EnvVar) bool {
@@ -133,19 +140,35 @@ func ParseTFReq(pod *corev1.Pod) []TFReq {
 		if ok {
 			req.ConnectionNamespace = connectionNamespaceEnv.Value
 		}
-		// Parse TFLOPS requirement
-		if hasTflops {
-			tflops, err := resource.ParseQuantity(tflopsStr)
+		// Parse TFLOPS request
+		if hasTflopsReq {
+			tflops, err := resource.ParseQuantity(tflopsReqStr)
+			if err == nil {
+				req.TflopsRequest = tflops
+			}
+		}
+
+		// Parse VRAM request
+		if hasVramReq {
+			vram, err := resource.ParseQuantity(vramReqStr)
+			if err == nil {
+				req.VramRequest = vram
+			}
+		}
+
+		// Parse TFLOPS limit; gated on the limit annotation itself, not on the request
+		if hasTflopsLimit {
+			tflops, err := resource.ParseQuantity(tflopsLimitStr)
 			if err == nil {
-				req.Tflops = tflops
+				req.TflopsLimit = tflops
 			}
 		}
 
-		// Parse VRAM requirement
-		if hasVram {
-			vram, err := resource.ParseQuantity(vramStr)
+		// Parse VRAM limit; gated on the limit annotation itself, not on the request
+		if hasVramLimit {
+			vram, err := resource.ParseQuantity(vramLimitStr)
 			if err == nil {
-				req.Vram = vram
+				req.VramLimit = vram
 			}
 		}
 
@@ -155,7 +178,7 @@ func ParseTFReq(pod *corev1.Pod) []TFReq {
 	return reqs
 }
 
-func (m *TensorFusionPodMutator) patchTFClient(pod *corev1.Pod, tfReq []TFReq) ([]jsonpatch.JsonPatchOperation, error) {
+func (m *TensorFusionPodMutator) patchTFClient(pod *corev1.Pod, tfReq []TFResource) ([]jsonpatch.JsonPatchOperation, error) {
 	// Convert the current pod to JSON
 	currentBytes, err := json.Marshal(pod)
 	if err != nil {
diff --git a/internal/webhook/v1/pod_webhook_test.go b/internal/webhook/v1/pod_webhook_test.go
@@ -19,6 +19,7 @@ package v1
 import (
 	"context"
 	"encoding/json"
+	"fmt"
 	"net/http"
 
 	tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
@@ -63,14 +64,16 @@ var _ = Describe("TensorFusionPodMutator", func() {
 	})
 
 	Context("Handle", func() {
-		It("should successfully mutate a pod with TF requirements", func() {
+		It("should successfully mutate a pod with TF resources", func() {
 			pod := &corev1.Pod{
 				ObjectMeta: metav1.ObjectMeta{
 					Name:      "test-pod",
 					Namespace: "default",
 					Annotations: map[string]string{
-						constants.Domain + "/tflops-main": "100",
-						constants.Domain + "/vram-main":   "16Gi",
+						fmt.Sprintf(constants.TFLOPSRequestAnnotationFormat, "main"): "10",
+						fmt.Sprintf(constants.VRAMRequestAnnotationFormat, "main"):   "1Gi",
+						fmt.Sprintf(constants.TFLOPSLimitAnnotationFormat, "main"):   "100",
+						fmt.Sprintf(constants.VRAMLimitAnnotationFormat, "main"):     "16Gi",
 					},
 				},
 				Spec: corev1.PodSpec{
@@ -165,8 +168,10 @@ var _ = Describe("TensorFusionPodMutator", func() {
 			pod := &corev1.Pod{
 				ObjectMeta: metav1.ObjectMeta{
 					Annotations: map[string]string{
-						constants.Domain + "/tflops-test-container": "100",
-						constants.Domain + "/vram-test-container":   "16Gi",
+						fmt.Sprintf(constants.TFLOPSRequestAnnotationFormat, "test-container"): "10",
+						fmt.Sprintf(constants.VRAMRequestAnnotationFormat, "test-container"):   "1Gi",
+						fmt.Sprintf(constants.TFLOPSLimitAnnotationFormat, "test-container"):   "100",
+						fmt.Sprintf(constants.VRAMLimitAnnotationFormat, "test-container"):     "16Gi",
 					},
 				},
 				Spec: corev1.PodSpec{
@@ -188,11 +193,13 @@ var _ = Describe("TensorFusionPodMutator", func() {
 				},
 			}
 
-			reqs := ParseTFReq(pod)
-			Expect(reqs).To(HaveLen(1))
-			Expect(reqs[0].ContainerName).To(Equal("test-container"))
-			Expect(reqs[0].Tflops.String()).To(Equal("100"))
-			Expect(reqs[0].Vram.String()).To(Equal("16Gi"))
+			resources := ParseTFResources(pod)
+			Expect(resources).To(HaveLen(1))
+			Expect(resources[0].ContainerName).To(Equal("test-container"))
+			Expect(resources[0].TflopsRequest.String()).To(Equal("10"))
+			Expect(resources[0].VramRequest.String()).To(Equal("1Gi"))
+			Expect(resources[0].TflopsLimit.String()).To(Equal("100"))
+			Expect(resources[0].VramLimit.String()).To(Equal("16Gi"))
 		})
 	})
 
@@ -207,7 +214,7 @@ var _ = Describe("TensorFusionPodMutator", func() {
 					},
 				},
 			}
-			patch, err := mutator.patchTFClient(pod, []TFReq{{ContainerName: "test-container", Tflops: resource.MustParse("100"), Vram: resource.MustParse("16Gi")}})
+			patch, err := mutator.patchTFClient(pod, []TFResource{{ContainerName: "test-container", TflopsRequest: resource.MustParse("100"), VramRequest: resource.MustParse("16Gi")}})
 			Expect(err).NotTo(HaveOccurred())
 			Expect(patch).NotTo(BeEmpty())
 			Expect(patch).To(HaveLen(2))