NexusGPU
diff --git a/‎.vscode/launch.json‎
Lines changed: 21 additions & 5 deletions b/‎.vscode/launch.json‎
Lines changed: 21 additions & 5 deletions
diff --git a/‎.vscode/settings.json‎
Lines changed: 3 additions & 0 deletions b/‎.vscode/settings.json‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎charts/tensor-fusion/Chart.yaml‎
Lines changed: 2 additions & 2 deletions b/‎charts/tensor-fusion/Chart.yaml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎charts/tensor-fusion/templates/config.yaml‎
Lines changed: 4 additions & 1 deletion b/‎charts/tensor-fusion/templates/config.yaml‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎charts/tensor-fusion/templates/controller-deployment.yaml‎
Lines changed: 5 additions & 0 deletions b/‎charts/tensor-fusion/templates/controller-deployment.yaml‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎charts/tensor-fusion/templates/greptime-secret.yaml‎
Lines changed: 1 addition & 1 deletion b/‎charts/tensor-fusion/templates/greptime-secret.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎charts/tensor-fusion/templates/rbac.yaml‎
Lines changed: 24 additions & 25 deletions b/‎charts/tensor-fusion/templates/rbac.yaml‎
Lines changed: 24 additions & 25 deletions
diff --git a/‎charts/tensor-fusion/values.yaml‎
Lines changed: 56 additions & 0 deletions b/‎charts/tensor-fusion/values.yaml‎
Lines changed: 56 additions & 0 deletions
@@ -10,7 +10,7 @@
             "request": "launch",
             "mode": "auto",
             "env": {
-                "ENABLE_WEBHOOKS": "false"
+                "ENABLE_WEBHOOKS": "false",
             },
             "program": "${workspaceFolder}/cmd/main.go",
             "args": [
@@ -39,20 +39,36 @@
             "console": "integratedTerminal",
             "env": {
                 "KUBECONFIG": "~/.kube/config-tf-dev",
-                "ENABLE_WEBHOOKS": "false"
+                "ENABLE_WEBHOOKS": "false",
+                "ENABLE_SCHEDULER": "false"
             },
+            "args": [
+                "--gpu-info-config", "${workspaceFolder}/config/samples/gpu-info-config.yaml",
+                "--dynamic-config", "${workspaceFolder}/config/samples/dynamic-config.yaml",
+                "--scheduler-config", "${workspaceFolder}/config/samples/scheduler-config.yaml",
+                "--enable-alert",
+                "--enable-auto-scale"
+            ],
             "program": "${workspaceFolder}/cmd/main.go",
         },
         {
-            "name": "Debug Demo Env Operator",
+            "name": "Debug Local Env Operator",
             "type": "go",
             "request": "launch",
             "mode": "auto",
             "console": "integratedTerminal",
             "env": {
-                "KUBECONFIG": "~/.kube/dev_us-east-1_demo",
-                "ENABLE_WEBHOOKS": "false"
+                "KUBECONFIG": "~/.kube/config-local-studio",
+                "ENABLE_WEBHOOKS": "false",
+                "ENABLE_SCHEDULER": "false"
             },
+            "args": [
+                "--gpu-info-config", "${workspaceFolder}/config/samples/gpu-info-config.yaml",
+                "--dynamic-config", "${workspaceFolder}/config/samples/dynamic-config.yaml",
+                "--scheduler-config", "${workspaceFolder}/config/samples/scheduler-config.yaml",
+                "--enable-alert",
+                "--enable-auto-scale"
+            ],
             "program": "${workspaceFolder}/cmd/main.go",
         },
         {
 
@@ -98,6 +98,7 @@
         "RDNA",
         "readyz",
         "replicaset",
+        "replicasets",
         "runbook",
         "runpod",
         "samber",
@@ -108,6 +109,7 @@
         "schedulingcorev",
         "shirou",
         "shortuuid",
+        "statefulsets",
         "strategicpatch",
         "strategicpatches",
         "subresource",
@@ -116,6 +118,7 @@
         "tensorfusionaiv",
         "tensorfusioncluster",
         "tensorfusionclusters",
+        "tensorfusionconnections",
         "tensorfusionworkload",
         "tensorfusionworkloads",
         "Tera",
 
@@ -15,10 +15,10 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 1.4.1
+version: 1.4.2
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
 # follow Semantic Versioning. They should reflect the version the application is using.
 # It is recommended to use it with quotes.
-appVersion: "1.30.3"
+appVersion: "1.35.2"
@@ -9,4 +9,7 @@ metadata:
 data:
   # Read by tensor fusion operator, eval alert rules and send to alertmanager if enabledAlert is true
   config.yaml: |
-    {{- toYaml .Values.dynamicConfig | nindent 4 }}
+    {{- toYaml .Values.dynamicConfig | nindent 4 }}
+  scheduler-config.yaml: |
+    {{- toYaml .Values.schedulerConfig | nindent 4 }}
+    
@@ -87,6 +87,10 @@ spec:
               mountPath: /etc/tensor-fusion/config.yaml
               subPath: config.yaml
               readOnly: true
+            - name: config
+              mountPath: /etc/tensor-fusion/scheduler-config.yaml
+              subPath: scheduler-config.yaml
+              readOnly: true
         {{- if .Values.agent.agentId }}
         - name: cluster-agent
           image: "{{ .Values.agent.image.repository }}:{{ .Values.agent.image.tag | default "latest" }}"
@@ -172,6 +176,7 @@ spec:
         - name: kubernetes-logs
           hostPath:
             path: /var/log/pods
+            type: DirectoryOrCreate
       {{- with .Values.controller.affinity }}
       affinity:
         {{- toYaml . | nindent 8 }}
 
@@ -3,7 +3,7 @@ apiVersion: v1
 kind: Secret
 metadata:
   name: tf-greptimedb-secret
-  namespace: {{ .Release.Namespace }}
+  namespace: {{ include "tensor-fusion.namespace" . }}
 data:
   password: "{{ .Values.greptime.password | b64enc }}"
 {{- end }}
@@ -3,10 +3,12 @@ kind: ClusterRole
 metadata:
   name: {{ include "tensor-fusion.fullname" . }}-role
 rules:
+rules:
 - apiGroups:
   - ""
   resources:
   - configmaps
+  - events
   - namespaces
   verbs:
   - create
@@ -15,21 +17,6 @@ rules:
   - patch
   - update
   - watch
-- apiGroups:
-  - ""
-  resources:
-  - secrets
-  verbs:
-  - get
-  - list
-  - watch
-- apiGroups:
-  - ""
-  resources:
-  - events
-  verbs:
-  - create
-  - patch
 - apiGroups:
   - ""
   resources:
@@ -43,13 +30,12 @@ rules:
   - patch
   - update
   - watch
-  - deletecollection
 - apiGroups:
   - ""
   resources:
   - nodes/finalizers
-  - pods/finalizers
   - pods/binding
+  - pods/finalizers
   verbs:
   - update
 - apiGroups:
@@ -70,13 +56,23 @@ rules:
   - get
   - patch
   - update
+- apiGroups:
+  - ""
+  - policy
+  - storage.k8s.io
+  resources:
+  - '*'
+  verbs:
+  - get
+  - list
+  - watch
 - apiGroups:
   - apps
   resources:
   - daemonsets
   - deployments
-  - statefulsets
   - replicasets
+  - statefulsets
   verbs:
   - create
   - delete
@@ -110,25 +106,26 @@ rules:
   resources:
   - leases
   verbs:
+  - create
+  - delete
   - get
   - list
-  - watch
-  - create
-  - update
   - patch
-  - delete
+  - update
+  - watch
 - apiGroups:
   - tensor-fusion.ai
   resources:
-  - workloadprofiles
   - gpunodeclasses
   - gpunodes
   - gpupools
+  - gpuresourcequotas
   - gpus
   - schedulingconfigtemplates
   - tensorfusionclusters
   - tensorfusionconnections
   - tensorfusionworkloads
+  - workloadprofiles
   verbs:
   - create
   - delete
@@ -140,29 +137,31 @@ rules:
 - apiGroups:
   - tensor-fusion.ai
   resources:
-  - workloadprofiles/finalizers
   - gpunodeclasses/finalizers
   - gpunodes/finalizers
   - gpupools/finalizers
+  - gpuresourcequotas/finalizers
   - gpus/finalizers
   - schedulingconfigtemplates/finalizers
   - tensorfusionclusters/finalizers
   - tensorfusionconnections/finalizers
   - tensorfusionworkloads/finalizers
+  - workloadprofiles/finalizers
   verbs:
   - update
 - apiGroups:
   - tensor-fusion.ai
   resources:
-  - workloadprofiles/status
   - gpunodeclasses/status
   - gpunodes/status
   - gpupools/status
+  - gpuresourcequotas/status
   - gpus/status
   - schedulingconfigtemplates/status
   - tensorfusionclusters/status
   - tensorfusionconnections/status
   - tensorfusionworkloads/status
+  - workloadprofiles/status
   verbs:
   - get
   - patch
 
@@ -158,6 +158,62 @@ alert:
     templates:
     - /etc/alertmanager/*.tmpl
 
+schedulerConfig:
+  apiVersion: kubescheduler.config.k8s.io/v1
+  kind: KubeSchedulerConfiguration
+  clientConnection:
+    kubeconfig: ""
+    qps: 50
+    burst: 100
+  profiles:
+  # Refer: https://kubernetes.io/docs/reference/scheduling/config/
+  - schedulerName: tensor-fusion-scheduler
+    plugins:
+      preFilter:
+        enabled:
+        - name: GPUResourcesFit
+      filter:
+        enabled:
+        - name: GPUResourcesFit
+        - name: GPUNetworkTopologyAware
+      score:
+        enabled:
+        - name: GPUResourcesFit
+          weight: 5
+      preBind:
+        enabled:
+        - name: GPUResourcesFit
+    pluginConfig:
+    - name: GPUResourcesFit
+      args:
+        maxWorkerPerNode: 256
+        vramWeight: 0.7
+        tflopsWeight: 0.3
+    - name: GPUNetworkTopologyAware
+      args:
+        # Limit remote TFWorker RX/TX traffic so that a single node does not consume too much bandwidth
+        # Monitoring must be enabled for this setting to take effect
+        totalIntranetBandWidthGBps: 100
+    - name: NodeResourcesFit
+      args:
+        scoringStrategy:
+          resources:
+          - name: cpu
+            weight: 1
+          - name: memory
+            weight: 1
+          requestedToCapacityRatio:
+            shape:
+            - utilization: 0
+              score: 0
+            - utilization: 80
+              score: 10
+            - utilization: 90
+              score: 2
+            - utilization: 100
+              score: 0
+          type: RequestedToCapacityRatio
+
 # KV structure config for other global configs
 dynamicConfig:
   # retention period for metrics data