NexusGPU
diff --git a/‎.vscode/settings.json‎
Lines changed: 1 addition & 0 deletions b/‎.vscode/settings.json‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎api/v1/gpu_types.go‎
Lines changed: 19 additions & 0 deletions b/‎api/v1/gpu_types.go‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎api/v1/schedulingconfigtemplate_types.go‎
Lines changed: 1 addition & 0 deletions b/‎api/v1/schedulingconfigtemplate_types.go‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎api/v1/tensorfusionconnection_types.go‎
Lines changed: 5 additions & 0 deletions b/‎api/v1/tensorfusionconnection_types.go‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎api/v1/tensorfusionworkload_types.go‎
Lines changed: 9 additions & 0 deletions b/‎api/v1/tensorfusionworkload_types.go‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎api/v1/workloadprofile_types.go‎
Lines changed: 5 additions & 0 deletions b/‎api/v1/workloadprofile_types.go‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎charts/tensor-fusion/Chart.yaml‎
Lines changed: 2 additions & 2 deletions b/‎charts/tensor-fusion/Chart.yaml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎charts/tensor-fusion/crds/tensor-fusion.ai_gpus.yaml‎
Lines changed: 17 additions & 0 deletions b/‎charts/tensor-fusion/crds/tensor-fusion.ai_gpus.yaml‎
Lines changed: 17 additions & 0 deletions
diff --git a/‎charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml‎
Lines changed: 3 additions & 0 deletions b/‎charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionconnections.yaml‎
Lines changed: 17 additions & 1 deletion b/‎charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionconnections.yaml‎
Lines changed: 17 additions & 1 deletion
@@ -20,6 +20,7 @@
         "certgen",
         "certificaterequests",
         "certmanager",
+        "clientcmd",
         "clientgoscheme",
         "clientset",
         "cloudnative",
 
@@ -35,12 +35,29 @@ type GPUStatus struct {
 	NodeSelector map[string]string `json:"nodeSelector"`
 	GPUModel     string            `json:"gpuModel"`
 
+	// GPU is used by tensor-fusion or nvidia-operator
+	// This is the key to be compatible with nvidia-device-plugin to avoid resource overlap
+	// Hypervisor will watch kubelet device plugin to report all GPUs already used by nvidia-device-plugin
+	// GPUs will be grouped by usedBy to be used by different Pods,
+	// tensor-fusion annotation or nvidia-device-plugin resource block
+	// +optional
+	UsedBy UsedBySystem `json:"usedBy,omitempty"`
+
 	Message string `json:"message"`
 
 	// +optional
 	RunningApps []*RunningAppDetail `json:"runningApps,omitempty"`
 }
 
+// +kubebuilder:validation:Enum=tensor-fusion;nvidia-device-plugin
+// +default="tensor-fusion"
+type UsedBySystem string // Value type of GPUStatus.UsedBy. NOTE(review): the generated gpus CRD lists the enum but no default for usedBy; confirm the default marker above takes effect on a type declaration.
+
+const ( // Allowed UsedBySystem values; they match the kubebuilder Enum marker on the type.
+	UsedByTensorFusion       UsedBySystem = "tensor-fusion"        // GPU is used by tensor-fusion.
+	UsedByNvidiaDevicePlugin UsedBySystem = "nvidia-device-plugin" // GPU is already used by nvidia-device-plugin.
+)
+
 type RunningAppDetail struct {
 	// Workload name namespace
 	Name      string `json:"name,omitempty"`
@@ -72,6 +89,8 @@ const (
 // +kubebuilder:printcolumn:name="Available TFlops",type="string",JSONPath=".status.available.tflops"
 // +kubebuilder:printcolumn:name="Available VRAM",type="string",JSONPath=".status.available.vram"
 // +kubebuilder:printcolumn:name="Device UUID",type="string",JSONPath=".status.uuid"
+// +kubebuilder:printcolumn:name="Used By",type="string",JSONPath=".status.usedBy"
+// +kubebuilder:printcolumn:name="Node",type="string",JSONPath=".status.nodeSelector"
 // GPU is the Schema for the gpus API.
 type GPU struct {
 	metav1.TypeMeta   `json:",inline"`
 
@@ -217,6 +217,7 @@ type SchedulingConfigTemplateStatus struct {
 // +kubebuilder:resource:scope=Cluster
 // +kubebuilder:printcolumn:name="Mode",type="string",JSONPath=".spec.placement.mode"
 // +kubebuilder:printcolumn:name="Allow Local GPU",type="string",JSONPath=".spec.placement.allowLocalGPU"
+// +kubebuilder:printcolumn:name="AutoFreeze",type="string",JSONPath=".spec.hypervisor.autoFreezeAndResume.autoFreeze.enable"
 // SchedulingConfigTemplate is the Schema for the schedulingconfigtemplates API.
 type SchedulingConfigTemplate struct {
 	metav1.TypeMeta   `json:",inline"`
 
@@ -46,6 +46,11 @@ type TensorFusionConnectionStatus struct {
 
 // +kubebuilder:object:root=true
 // +kubebuilder:subresource:status
+// +kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase"
+// +kubebuilder:printcolumn:name="Connection URL",type="string",JSONPath=".status.connectionURL"
+// +kubebuilder:printcolumn:name="Worker Name",type="string",JSONPath=".status.workerName"
+// +kubebuilder:printcolumn:name="Workload Name",type="string",JSONPath=".spec.workloadName"
+// +kubebuilder:printcolumn:name="Client Pod",type="string",JSONPath=".spec.clientPod"
 
 // TensorFusionConnection is the Schema for the tensorfusionconnections API.
 type TensorFusionConnection struct {
 
@@ -69,6 +69,15 @@ type TensorFusionWorkloadStatus struct {
 
 // +kubebuilder:object:root=true
 // +kubebuilder:subresource:status
+// +kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase"
+// +kubebuilder:printcolumn:name="Worker Count",type="string",JSONPath=".status.workerCount"
+// +kubebuilder:printcolumn:name="Ready Workers",type="string",JSONPath=".status.readyWorkers"
+// +kubebuilder:printcolumn:name="Pod Template Hash",type="string",JSONPath=".status.podTemplateHash"
+// +kubebuilder:printcolumn:name="Pool Name",type="string",JSONPath=".spec.poolName"
+// +kubebuilder:printcolumn:name="QoS",type="string",JSONPath=".spec.qos"
+// +kubebuilder:printcolumn:name="Is Local GPU",type="string",JSONPath=".spec.isLocalGPU"
+// +kubebuilder:printcolumn:name="GPU Number",type="string",JSONPath=".spec.gpuCount"
+// +kubebuilder:printcolumn:name="Replicas",type="string",JSONPath=".spec.replicas"
 
 // TensorFusionWorkload is the Schema for the tensorfusionworkloads API.
 type TensorFusionWorkload struct {
 
@@ -80,6 +80,11 @@ type WorkloadProfileStatus struct {
 
 // +kubebuilder:object:root=true
 // +kubebuilder:subresource:status
+// +kubebuilder:printcolumn:name="Pool Name",type="string",JSONPath=".spec.poolName"
+// +kubebuilder:printcolumn:name="QoS",type="string",JSONPath=".spec.qos"
+// +kubebuilder:printcolumn:name="Is Local GPU",type="string",JSONPath=".spec.isLocalGPU"
+// +kubebuilder:printcolumn:name="GPU Number",type="string",JSONPath=".spec.gpuCount"
+// +kubebuilder:printcolumn:name="Replicas",type="string",JSONPath=".spec.replicas"
 
 // WorkloadProfile is the Schema for the workloadprofiles API.
 type WorkloadProfile struct {
 
@@ -15,10 +15,10 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 1.4.5
+version: 1.4.6
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
 # follow Semantic Versioning. They should reflect the version the application is using.
 # It is recommended to use it with quotes.
-appVersion: "1.36.3"
+appVersion: "1.39.1"
@@ -36,6 +36,12 @@ spec:
     - jsonPath: .status.uuid
       name: Device UUID
       type: string
+    - jsonPath: .status.usedBy
+      name: Used By
+      type: string
+    - jsonPath: .status.nodeSelector
+      name: Node
+      type: string
     name: v1
     schema:
       openAPIV3Schema:
@@ -131,6 +137,17 @@ spec:
                   - count
                   type: object
                 type: array
+              usedBy:
+                description: |-
+                  GPU is used by tensor-fusion or nvidia-operator
+                  This is the key to be compatible with nvidia-device-plugin to avoid resource overlap
+                  Hypervisor will watch kubelet device plugin to report all GPUs already used by nvidia-device-plugin
+                  GPUs will be grouped by usedBy to be used by different Pods,
+                  tensor-fusion annotation or nvidia-device-plugin resource block
+                enum:
+                - tensor-fusion
+                - nvidia-device-plugin
+                type: string
               uuid:
                 type: string
             required:
 
@@ -21,6 +21,9 @@ spec:
     - jsonPath: .spec.placement.allowLocalGPU
       name: Allow Local GPU
       type: string
+    - jsonPath: .spec.hypervisor.autoFreezeAndResume.autoFreeze.enable
+      name: AutoFreeze
+      type: string
     name: v1
     schema:
       openAPIV3Schema:
 
@@ -14,7 +14,23 @@ spec:
     singular: tensorfusionconnection
   scope: Namespaced
   versions:
-  - name: v1
+  - additionalPrinterColumns:
+    - jsonPath: .status.phase
+      name: Phase
+      type: string
+    - jsonPath: .status.connectionURL
+      name: Connection URL
+      type: string
+    - jsonPath: .status.workerName
+      name: Worker Name
+      type: string
+    - jsonPath: .spec.workloadName
+      name: Workload Name
+      type: string
+    - jsonPath: .spec.clientPod
+      name: Client Pod
+      type: string
+    name: v1
     schema:
       openAPIV3Schema:
         description: TensorFusionConnection is the Schema for the tensorfusionconnections