Skip to content

Commit 3ce9617

Browse files
authored
fix: tf connection reconcile bug, add API auth and fix scheduler binding permission bug (#257)
* fix: worker scheduled to CPU node bug; refactor main func; webhook missing-patches issue
* chore: lint issue
* fix: connection and assign-port API auth; scheduler binding permission
* fix: connection could not recover when pod is running again
* fix: lint issue
* fix: scheduler did not set the correct gpu-ids annotation
1 parent bafa6e0 commit 3ce9617

File tree

17 files changed

+267
-56
lines changed

17 files changed

+267
-56
lines changed

.vscode/settings.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@
107107
"schedulingconfigtemplate",
108108
"schedulingconfigtemplates",
109109
"schedulingcorev",
110+
"serviceaccount",
110111
"shirou",
111112
"shortuuid",
112113
"statefulsets",
@@ -118,6 +119,7 @@
118119
"tensorfusionaiv",
119120
"tensorfusioncluster",
120121
"tensorfusionclusters",
122+
"tensorfusionconnection",
121123
"tensorfusionconnections",
122124
"tensorfusionworkload",
123125
"tensorfusionworkloads",

charts/tensor-fusion/Chart.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@ type: application
1515
# This is the chart version. This version number should be incremented each time you make changes
1616
# to the chart and its templates, including the app version.
1717
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18-
version: 1.4.2
18+
version: 1.4.3
1919

2020
# This is the version number of the application being deployed. This version number should be
2121
# incremented each time you make changes to the application. Versions are not expected to
2222
# follow Semantic Versioning. They should reflect the version the application is using.
2323
# It is recommended to use it with quotes.
24-
appVersion: "1.35.2"
24+
appVersion: "1.36.1"

charts/tensor-fusion/templates/rbac.yaml

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ kind: ClusterRole
33
metadata:
44
name: {{ include "tensor-fusion.fullname" . }}-role
55
rules:
6-
rules:
76
- apiGroups:
87
- ""
98
resources:
@@ -35,24 +34,19 @@ rules:
3534
resources:
3635
- nodes/finalizers
3736
- pods/binding
37+
- pods/exec
3838
- pods/finalizers
3939
verbs:
40-
- update
41-
- apiGroups:
42-
- ""
43-
resources:
44-
- nodes/status
45-
- pods/status
46-
verbs:
40+
- create
4741
- get
4842
- patch
4943
- update
5044
- apiGroups:
5145
- ""
5246
resources:
53-
- pods/exec
47+
- nodes/status
48+
- pods/status
5449
verbs:
55-
- create
5650
- get
5751
- patch
5852
- update

cmd/main.go

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,7 @@ func main() {
156156
gpuInfos := make([]config.GpuInfo, 0)
157157
gpuPricingMap := make(map[string]float64)
158158
startWatchGPUInfoChanges(ctx, &gpuInfos, gpuPricingMap)
159+
utils.InitServiceAccountConfig()
159160

160161
metricsServerOptions := metricsserver.Options{
161162
BindAddress: metricsAddr,
@@ -205,7 +206,7 @@ func main() {
205206

206207
startCustomResourceController(ctx, mgr, metricsRecorder, allocator, portAllocator)
207208

208-
startHttpServerForTFClient(ctx, kc, portAllocator)
209+
startHttpServerForTFClient(ctx, kc, portAllocator, mgr.Elected())
209210

210211
// +kubebuilder:scaffold:builder
211212
addHealthCheckAPI(mgr)
@@ -250,7 +251,12 @@ func startTensorFusionAllocators(
250251
return allocator, portAllocator
251252
}
252253

253-
func startHttpServerForTFClient(ctx context.Context, kc *rest.Config, portAllocator *portallocator.PortAllocator) {
254+
func startHttpServerForTFClient(
255+
ctx context.Context,
256+
kc *rest.Config,
257+
portAllocator *portallocator.PortAllocator,
258+
leaderChan <-chan struct{},
259+
) {
254260
client, err := client.NewWithWatch(kc, client.Options{Scheme: scheme})
255261
if err != nil {
256262
setupLog.Error(err, "failed to create client with watch")
@@ -266,7 +272,7 @@ func startHttpServerForTFClient(ctx context.Context, kc *rest.Config, portAlloca
266272
setupLog.Error(err, "failed to create assign host port router")
267273
os.Exit(1)
268274
}
269-
httpServer := server.NewHTTPServer(connectionRouter, assignHostPortRouter)
275+
httpServer := server.NewHTTPServer(connectionRouter, assignHostPortRouter, leaderChan)
270276
go func() {
271277
err := httpServer.Run()
272278
if err != nil {

config/rbac/role.yaml

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -35,24 +35,19 @@ rules:
3535
resources:
3636
- nodes/finalizers
3737
- pods/binding
38+
- pods/exec
3839
- pods/finalizers
3940
verbs:
40-
- update
41-
- apiGroups:
42-
- ""
43-
resources:
44-
- nodes/status
45-
- pods/status
46-
verbs:
41+
- create
4742
- get
4843
- patch
4944
- update
5045
- apiGroups:
5146
- ""
5247
resources:
53-
- pods/exec
48+
- nodes/status
49+
- pods/status
5450
verbs:
55-
- create
5651
- get
5752
- patch
5853
- update

internal/constants/constants.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,3 +185,21 @@ const (
185185
LowFrequencyObjFailureMaxBurst = 1
186186
LowFrequencyObjFailureConcurrentReconcile = 5
187187
)
188+
189+
// For security enhancement, there are 2 types of endpoints to protect
190+
// 1. client call operator /connection API, to obtain tensor fusion worker's URL
191+
// 2. worker call hypervisor API, to obtain current workers GPU quota info
192+
// if this env var is set on operator and hypervisor, will try to verify JWT signature for each call
193+
// not implemented yet, iss is public in EKS and most K8S distribution
194+
// but k3s and some K8S distribution may not support, need to find some way to get SA token JWT pub key
195+
196+
const HypervisorVerifyServiceAccountEnabledEnvVar = "SA_TOKEN_VERIFY_ENABLED"
197+
const HypervisorVerifyServiceAccountPublicKeyEnvVar = "SA_TOKEN_VERIFY_PUBLIC_KEY"
198+
199+
// TensorFusion ControllerManager's http endpoint will verify Pod JWT signature
200+
// if this env var is set, will disable the verification, it's enabled by default
201+
// should not set to true in production environment
202+
const DisableConnectionAuthEnv = "DISABLE_CONNECTION_AUTH"
203+
204+
const AuthorizationHeader = "Authorization"
205+
const ExtraVerificationInfoPodIDKey = "authentication.kubernetes.io/pod-uid"

internal/controller/node_controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ type NodeReconciler struct {
5353

5454
// +kubebuilder:rbac:groups=core,resources=nodes,verbs=get;list;watch;create;update;patch;delete
5555
// +kubebuilder:rbac:groups=core,resources=nodes/status,verbs=get;update;patch
56-
// +kubebuilder:rbac:groups=core,resources=nodes/finalizers,verbs=update
56+
// +kubebuilder:rbac:groups=core,resources=nodes/finalizers,verbs=create;get;patch;update
5757

5858
// This reconcile loop only take effect on nodeSelector mode, while in AutoProvision mode, GPUNode will manage the K8S Node rather than reversed
5959
func (r *NodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {

internal/controller/pod_controller.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,8 @@ type PodReconciler struct {
5757
// +kubebuilder:rbac:groups=policy,resources=*,verbs=get;list;watch
5858
// +kubebuilder:rbac:groups=core,resources=pods/status,verbs=get;update;patch
5959
// +kubebuilder:rbac:groups=core,resources=pods/exec,verbs=create;get;update;patch
60-
// +kubebuilder:rbac:groups=core,resources=pods/finalizers,verbs=update
61-
// +kubebuilder:rbac:groups=core,resources=pods/binding,verbs=update
60+
// +kubebuilder:rbac:groups=core,resources=pods/finalizers,verbs=create;get;update;patch
61+
// +kubebuilder:rbac:groups=core,resources=pods/binding,verbs=create;get;update;patch
6262

6363
// Add GPU connection for Pods using GPU
6464
// Have to create TensorFusion connection here because pod UID not available in MutatingWebhook

internal/controller/tensorfusioncluster_controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ func (r *TensorFusionClusterReconciler) listOwnedGPUPools(ctx context.Context, t
197197
return gpupoolsList.Items, nil
198198
}
199199

200-
func (r *TensorFusionClusterReconciler) reconcileTimeSeriesDatabase(ctx context.Context, tfc *tfv1.TensorFusionCluster) (bool, error) {
200+
func (r *TensorFusionClusterReconciler) reconcileTimeSeriesDatabase(_ context.Context, _ *tfv1.TensorFusionCluster) (bool, error) {
201201
// TODO: Not implemented yet
202202
return false, nil
203203
}

internal/controller/tensorfusionconnection_controller.go

Lines changed: 28 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -87,10 +87,8 @@ func (r *TensorFusionConnectionReconciler) Reconcile(ctx context.Context, req ct
8787
}
8888
}
8989

90-
needSelectWorker, shouldReturn, err := r.shouldSelectWorker(ctx, connection)
91-
if shouldReturn {
92-
// when err is not nil and shouldReturn is true,
93-
// it means already cleared the existing workerName and updated status, wait next reconcile loop
90+
needSelectWorker, err := r.shouldSelectWorker(ctx, connection)
91+
if err != nil {
9492
return ctrl.Result{}, err
9593
}
9694

@@ -99,6 +97,7 @@ func (r *TensorFusionConnectionReconciler) Reconcile(ctx context.Context, req ct
9997
return ctrl.Result{}, nil
10098
}
10199

100+
log.Info("Selecting worker for connection", "connection", connection.Name, "namespace", connection.Namespace)
102101
if workload.Spec.IsDynamicReplica() {
103102
// 1st MODE: select the dedicated worker if it's running, otherwise wait utils it's becoming ready
104103
return ctrl.Result{}, r.syncDedicatedWorkerStatus(ctx, connection)
@@ -109,8 +108,8 @@ func (r *TensorFusionConnectionReconciler) Reconcile(ctx context.Context, req ct
109108
}
110109

111110
func (r *TensorFusionConnectionReconciler) syncDedicatedWorkerStatus(ctx context.Context, connection *tfv1.TensorFusionConnection) error {
112-
var pod v1.Pod
113-
if err := r.Get(ctx, client.ObjectKey{Name: connection.Name, Namespace: connection.Namespace}, &pod); err != nil {
111+
pod := &v1.Pod{}
112+
if err := r.Get(ctx, client.ObjectKey{Name: connection.Name, Namespace: connection.Namespace}, pod); err != nil {
114113
return fmt.Errorf("failed to get dedicated worker pod for connection %w", err)
115114
}
116115
if pod.Status.Phase != v1.PodRunning {
@@ -124,14 +123,18 @@ func (r *TensorFusionConnectionReconciler) syncDedicatedWorkerStatus(ctx context
124123
if revision == "" {
125124
revision = "0"
126125
}
127-
connection.Status.ConnectionURL = fmt.Sprintf("native+%s+%d+%s-%s", pod.Status.PodIP, constants.TensorFusionWorkerPortNumber, pod.Name, revision)
126+
setConnectionWorkerURL(connection, pod.Status.PodIP, pod.Name, revision)
128127
if err := r.Status().Update(ctx, connection); err != nil {
129128
return fmt.Errorf("failed to update connection status: %w", err)
130129
}
131130
return nil
132131
}
133132
}
134133

134+
func setConnectionWorkerURL(connection *tfv1.TensorFusionConnection, podIp string, podName string, revision string) {
135+
connection.Status.ConnectionURL = fmt.Sprintf("native+%s+%d+%s-%s", podIp, constants.TensorFusionWorkerPortNumber, podName, revision)
136+
}
137+
135138
func (r *TensorFusionConnectionReconciler) selectWorkerAndSyncStatusFromWorkerPool(
136139
ctx context.Context,
137140
connection *tfv1.TensorFusionConnection,
@@ -162,8 +165,7 @@ func (r *TensorFusionConnectionReconciler) selectWorkerAndSyncStatusFromWorkerPo
162165
if resourceVersion == "" {
163166
resourceVersion = "0"
164167
}
165-
166-
connection.Status.ConnectionURL = fmt.Sprintf("native+%s+%d+%s-%s", s.WorkerIp, constants.TensorFusionWorkerPortNumber, s.WorkerName, resourceVersion)
168+
setConnectionWorkerURL(connection, s.WorkerIp, s.WorkerName, resourceVersion)
167169
if err := r.Status().Update(ctx, connection); err != nil {
168170
return ctrl.Result{}, fmt.Errorf("update connection status: %w", err)
169171
}
@@ -202,38 +204,48 @@ func (r *TensorFusionConnectionReconciler) patchMatchedWorkerLabel(ctx context.C
202204

203205
func (r *TensorFusionConnectionReconciler) shouldSelectWorker(
204206
ctx context.Context, connection *tfv1.TensorFusionConnection,
205-
) (bool, bool, error) {
206-
needSelectWorker := false
207+
) (needSelectWorker bool, err error) {
207208
if connection.Status.WorkerName != "" {
208209
// check if worker pod is still running
209210
pod := &v1.Pod{}
210211
if err := r.Get(ctx, client.ObjectKey{Name: connection.Status.WorkerName, Namespace: connection.Namespace}, pod); err != nil {
211212
if errors.IsNotFound(err) {
212213
needSelectWorker = true
213214
} else {
214-
return false, true, fmt.Errorf("failed to get worker pod: %w", err)
215+
return needSelectWorker, fmt.Errorf("failed to get worker pod: %w", err)
215216
}
216217
}
218+
// NOTE: no need to handle pod deleting since connection should be deleted at first, sync running status with Pod
217219
if pod.Status.Phase != v1.PodRunning {
218220
connection.Status.WorkerName = ""
219221
connection.Status.Phase = tfv1.WorkerFailed
220222
connection.Status.ConnectionURL = ""
221223
// set worker name to empty to trigger select worker again
222224
if updateErr := r.Status().Update(ctx, connection); updateErr != nil {
223-
return false, true, fmt.Errorf("failed to update connection status: %w", updateErr)
225+
return false, fmt.Errorf("failed to update connection status: %w", updateErr)
226+
}
227+
// let next reconcile loop to trigger select worker
228+
return false, nil
229+
} else if connection.Status.Phase != tfv1.WorkerRunning {
230+
// pod is running now, but connection is not running, update connection to running
231+
connection.Status.Phase = tfv1.WorkerRunning
232+
setConnectionWorkerURL(connection, pod.Status.PodIP, pod.Name, pod.ResourceVersion)
233+
if updateErr := r.Status().Update(ctx, connection); updateErr != nil {
234+
return false, fmt.Errorf("failed to update connection status: %w", updateErr)
224235
}
225-
return false, true, nil
236+
// current worker is working again, no need to select another worker
237+
return false, nil
226238
}
227239
} else {
228240
if connection.Status.Phase == "" {
229241
connection.Status.Phase = tfv1.WorkerPending
230242
if updateErr := r.Status().Update(ctx, connection); updateErr != nil {
231-
return false, true, fmt.Errorf("failed to update connection status: %w", updateErr)
243+
return false, fmt.Errorf("failed to update connection status: %w", updateErr)
232244
}
233245
}
234246
needSelectWorker = true
235247
}
236-
return needSelectWorker, false, nil
248+
return needSelectWorker, nil
237249
}
238250

239251
// SetupWithManager sets up the controller with the Manager.

0 commit comments

Comments (0)