Skip to content

Commit 97b1729

Browse files
authored
feat: new scheduler and webhook handler (#256)
* fix: remove unused tf cluster fields, add PatchToEmbeddedWorkerContainer in client patch
* fix: rbac issue of scheduler framework, bump helm version
* fix: worker schedule to cpu node bug, refactor main func, webhook missing patches issue
* chore: lint issue
* fix: add test case for new mutating webhook
1 parent 2fea94a commit 97b1729

19 files changed

+329
-185
lines changed

.vscode/launch.json

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
"request": "launch",
1111
"mode": "auto",
1212
"env": {
13-
"ENABLE_WEBHOOKS": "false"
13+
"ENABLE_WEBHOOKS": "false",
1414
},
1515
"program": "${workspaceFolder}/cmd/main.go",
1616
"args": [
@@ -39,20 +39,36 @@
3939
"console": "integratedTerminal",
4040
"env": {
4141
"KUBECONFIG": "~/.kube/config-tf-dev",
42-
"ENABLE_WEBHOOKS": "false"
42+
"ENABLE_WEBHOOKS": "false",
43+
"ENABLE_SCHEDULER": "false"
4344
},
45+
"args": [
46+
"--gpu-info-config", "${workspaceFolder}/config/samples/gpu-info-config.yaml",
47+
"--dynamic-config", "${workspaceFolder}/config/samples/dynamic-config.yaml",
48+
"--scheduler-config", "${workspaceFolder}/config/samples/scheduler-config.yaml",
49+
"--enable-alert",
50+
"--enable-auto-scale"
51+
],
4452
"program": "${workspaceFolder}/cmd/main.go",
4553
},
4654
{
47-
"name": "Debug Demo Env Operator",
55+
"name": "Debug Local Env Operator",
4856
"type": "go",
4957
"request": "launch",
5058
"mode": "auto",
5159
"console": "integratedTerminal",
5260
"env": {
53-
"KUBECONFIG": "~/.kube/dev_us-east-1_demo",
54-
"ENABLE_WEBHOOKS": "false"
61+
"KUBECONFIG": "~/.kube/config-local-studio",
62+
"ENABLE_WEBHOOKS": "false",
63+
"ENABLE_SCHEDULER": "false"
5564
},
65+
"args": [
66+
"--gpu-info-config", "${workspaceFolder}/config/samples/gpu-info-config.yaml",
67+
"--dynamic-config", "${workspaceFolder}/config/samples/dynamic-config.yaml",
68+
"--scheduler-config", "${workspaceFolder}/config/samples/scheduler-config.yaml",
69+
"--enable-alert",
70+
"--enable-auto-scale"
71+
],
5672
"program": "${workspaceFolder}/cmd/main.go",
5773
},
5874
{

.vscode/settings.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@
9898
"RDNA",
9999
"readyz",
100100
"replicaset",
101+
"replicasets",
101102
"runbook",
102103
"runpod",
103104
"samber",
@@ -108,6 +109,7 @@
108109
"schedulingcorev",
109110
"shirou",
110111
"shortuuid",
112+
"statefulsets",
111113
"strategicpatch",
112114
"strategicpatches",
113115
"subresource",
@@ -116,6 +118,7 @@
116118
"tensorfusionaiv",
117119
"tensorfusioncluster",
118120
"tensorfusionclusters",
121+
"tensorfusionconnections",
119122
"tensorfusionworkload",
120123
"tensorfusionworkloads",
121124
"Tera",

charts/tensor-fusion/Chart.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@ type: application
1515
# This is the chart version. This version number should be incremented each time you make changes
1616
# to the chart and its templates, including the app version.
1717
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18-
version: 1.4.1
18+
version: 1.4.2
1919

2020
# This is the version number of the application being deployed. This version number should be
2121
# incremented each time you make changes to the application. Versions are not expected to
2222
# follow Semantic Versioning. They should reflect the version the application is using.
2323
# It is recommended to use it with quotes.
24-
appVersion: "1.30.3"
24+
appVersion: "1.35.2"

charts/tensor-fusion/templates/config.yaml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,7 @@ metadata:
99
data:
1010
# Read by the tensor fusion operator, which evaluates alert rules and sends alerts to alertmanager if enabledAlert is true
1111
config.yaml: |
12-
{{- toYaml .Values.dynamicConfig | nindent 4 }}
12+
{{- toYaml .Values.dynamicConfig | nindent 4 }}
13+
scheduler-config.yaml: |
14+
{{- toYaml .Values.schedulerConfig | nindent 4 }}
15+

charts/tensor-fusion/templates/controller-deployment.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,10 @@ spec:
8787
mountPath: /etc/tensor-fusion/config.yaml
8888
subPath: config.yaml
8989
readOnly: true
90+
- name: config
91+
mountPath: /etc/tensor-fusion/scheduler-config.yaml
92+
subPath: scheduler-config.yaml
93+
readOnly: true
9094
{{- if .Values.agent.agentId }}
9195
- name: cluster-agent
9296
image: "{{ .Values.agent.image.repository }}:{{ .Values.agent.image.tag | default "latest" }}"
@@ -172,6 +176,7 @@ spec:
172176
- name: kubernetes-logs
173177
hostPath:
174178
path: /var/log/pods
179+
type: DirectoryOrCreate
175180
{{- with .Values.controller.affinity }}
176181
affinity:
177182
{{- toYaml . | nindent 8 }}

charts/tensor-fusion/templates/greptime-secret.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ apiVersion: v1
33
kind: Secret
44
metadata:
55
name: tf-greptimedb-secret
6-
namespace: {{ .Release.Namespace }}
6+
namespace: {{ include "tensor-fusion.namespace" . }}
77
data:
88
password: "{{ .Values.greptime.password | b64enc }}"
99
{{- end }}

charts/tensor-fusion/templates/rbac.yaml

Lines changed: 24 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,12 @@ kind: ClusterRole
33
metadata:
44
name: {{ include "tensor-fusion.fullname" . }}-role
55
rules:
6+
rules:
67
- apiGroups:
78
- ""
89
resources:
910
- configmaps
11+
- events
1012
- namespaces
1113
verbs:
1214
- create
@@ -15,21 +17,6 @@ rules:
1517
- patch
1618
- update
1719
- watch
18-
- apiGroups:
19-
- ""
20-
resources:
21-
- secrets
22-
verbs:
23-
- get
24-
- list
25-
- watch
26-
- apiGroups:
27-
- ""
28-
resources:
29-
- events
30-
verbs:
31-
- create
32-
- patch
3320
- apiGroups:
3421
- ""
3522
resources:
@@ -43,13 +30,12 @@ rules:
4330
- patch
4431
- update
4532
- watch
46-
- deletecollection
4733
- apiGroups:
4834
- ""
4935
resources:
5036
- nodes/finalizers
51-
- pods/finalizers
5237
- pods/binding
38+
- pods/finalizers
5339
verbs:
5440
- update
5541
- apiGroups:
@@ -70,13 +56,23 @@ rules:
7056
- get
7157
- patch
7258
- update
59+
- apiGroups:
60+
- ""
61+
- policy
62+
- storage.k8s.io
63+
resources:
64+
- '*'
65+
verbs:
66+
- get
67+
- list
68+
- watch
7369
- apiGroups:
7470
- apps
7571
resources:
7672
- daemonsets
7773
- deployments
78-
- statefulsets
7974
- replicasets
75+
- statefulsets
8076
verbs:
8177
- create
8278
- delete
@@ -110,25 +106,26 @@ rules:
110106
resources:
111107
- leases
112108
verbs:
109+
- create
110+
- delete
113111
- get
114112
- list
115-
- watch
116-
- create
117-
- update
118113
- patch
119-
- delete
114+
- update
115+
- watch
120116
- apiGroups:
121117
- tensor-fusion.ai
122118
resources:
123-
- workloadprofiles
124119
- gpunodeclasses
125120
- gpunodes
126121
- gpupools
122+
- gpuresourcequotas
127123
- gpus
128124
- schedulingconfigtemplates
129125
- tensorfusionclusters
130126
- tensorfusionconnections
131127
- tensorfusionworkloads
128+
- workloadprofiles
132129
verbs:
133130
- create
134131
- delete
@@ -140,29 +137,31 @@ rules:
140137
- apiGroups:
141138
- tensor-fusion.ai
142139
resources:
143-
- workloadprofiles/finalizers
144140
- gpunodeclasses/finalizers
145141
- gpunodes/finalizers
146142
- gpupools/finalizers
143+
- gpuresourcequotas/finalizers
147144
- gpus/finalizers
148145
- schedulingconfigtemplates/finalizers
149146
- tensorfusionclusters/finalizers
150147
- tensorfusionconnections/finalizers
151148
- tensorfusionworkloads/finalizers
149+
- workloadprofiles/finalizers
152150
verbs:
153151
- update
154152
- apiGroups:
155153
- tensor-fusion.ai
156154
resources:
157-
- workloadprofiles/status
158155
- gpunodeclasses/status
159156
- gpunodes/status
160157
- gpupools/status
158+
- gpuresourcequotas/status
161159
- gpus/status
162160
- schedulingconfigtemplates/status
163161
- tensorfusionclusters/status
164162
- tensorfusionconnections/status
165163
- tensorfusionworkloads/status
164+
- workloadprofiles/status
166165
verbs:
167166
- get
168167
- patch

charts/tensor-fusion/values.yaml

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,62 @@ alert:
158158
templates:
159159
- /etc/alertmanager/*.tmpl
160160

161+
schedulerConfig:
162+
apiVersion: kubescheduler.config.k8s.io/v1
163+
kind: KubeSchedulerConfiguration
164+
clientConnection:
165+
kubeconfig: ""
166+
qps: 50
167+
burst: 100
168+
profiles:
169+
# Refer: https://kubernetes.io/docs/reference/scheduling/config/
170+
- schedulerName: tensor-fusion-scheduler
171+
plugins:
172+
preFilter:
173+
enabled:
174+
- name: GPUResourcesFit
175+
filter:
176+
enabled:
177+
- name: GPUResourcesFit
178+
- name: GPUNetworkTopologyAware
179+
score:
180+
enabled:
181+
- name: GPUResourcesFit
182+
weight: 5
183+
preBind:
184+
enabled:
185+
- name: GPUResourcesFit
186+
pluginConfig:
187+
- name: GPUResourcesFit
188+
args:
189+
maxWorkerPerNode: 256
190+
vramWeight: 0.7
191+
tflopsWeight: 0.3
192+
- name: GPUNetworkTopologyAware
193+
args:
194+
# Limit remote TFWorker RX/TX traffic so that a single node does not consume too much bandwidth
195+
# Monitoring must be enabled for this to take effect
196+
totalIntranetBandWidthGBps: 100
197+
- name: NodeResourcesFit
198+
args:
199+
scoringStrategy:
200+
resources:
201+
- name: cpu
202+
weight: 1
203+
- name: memory
204+
weight: 1
205+
requestedToCapacityRatio:
206+
shape:
207+
- utilization: 0
208+
score: 0
209+
- utilization: 80
210+
score: 10
211+
- utilization: 90
212+
score: 2
213+
- utilization: 100
214+
score: 0
215+
type: RequestedToCapacityRatio
216+
161217
# KV structure config for other global configs
162218
dynamicConfig:
163219
# retention period for metrics data

0 commit comments

Comments
 (0)