Skip to content

Commit 984cbd1

Browse files
authored
feat: add sidecar worker mode for hard-isolation, worker customization in annotation (#387)
1 parent 7725719 commit 984cbd1

16 files changed

+34143
-55
lines changed

api/v1/workloadprofile_types.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,11 @@ type WorkloadProfileSpec struct {
5252
// Schedule the workload to the same GPU server that runs vGPU worker for best performance, default to false
5353
IsLocalGPU bool `json:"isLocalGPU,omitempty"`
5454

55+
// +optional
56+
// When set to sidecar worker mode, it's always Local GPU mode, and hard-isolated with shared memory
57+
// Defaults to false, which indicates the workload's embedded worker is in the same process, soft-isolated
58+
SidecarWorker bool `json:"sidecarWorker,omitempty"`
59+
5560
// +optional
5661
// GPUModel specifies the required GPU model (e.g., "A100", "H100")
5762
GPUModel string `json:"gpuModel,omitempty"`
@@ -68,6 +73,10 @@ type WorkloadProfileSpec struct {
6873
// +optional
6974
// NodeAffinity specifies the node affinity requirements for the workload
7075
NodeAffinity *v1.NodeAffinity `json:"nodeAffinity,omitempty"`
76+
77+
// +optional
78+
// WorkerPodTemplate is the template for the worker pod; it only takes effect in remote vGPU mode
79+
WorkerPodTemplate *v1.PodTemplateSpec `json:"workerPodTemplate,omitempty"`
7180
}
7281

7382
func (t WorkloadProfileSpec) IsDynamicReplica() bool {

api/v1/zz_generated.deepcopy.go

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionworkloads.yaml

Lines changed: 8419 additions & 0 deletions
Large diffs are not rendered by default.

charts/tensor-fusion/crds/tensor-fusion.ai_workloadprofiles.yaml

Lines changed: 8419 additions & 0 deletions
Large diffs are not rendered by default.

config/crd/bases/tensor-fusion.ai_tensorfusionworkloads.yaml

Lines changed: 8419 additions & 0 deletions
Large diffs are not rendered by default.

config/crd/bases/tensor-fusion.ai_workloadprofiles.yaml

Lines changed: 8419 additions & 0 deletions
Large diffs are not rendered by default.

go.mod

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,8 @@ require (
4040
k8s.io/klog/v2 v2.130.1
4141
k8s.io/kube-scheduler v0.34.1
4242
k8s.io/kubernetes v1.34.1
43-
k8s.io/utils v0.0.0-20250820121507-0af2bda4dd1d
44-
sigs.k8s.io/controller-runtime v0.22.1
43+
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4
44+
sigs.k8s.io/controller-runtime v0.22.2
4545
sigs.k8s.io/karpenter v1.6.2
4646
sigs.k8s.io/yaml v1.6.0
4747
)
@@ -181,7 +181,7 @@ require (
181181
gopkg.in/ini.v1 v1.67.0 // indirect
182182
gopkg.in/yaml.v2 v2.4.0 // indirect
183183
gopkg.in/yaml.v3 v3.0.1 // indirect
184-
k8s.io/apiextensions-apiserver v0.34.0 // indirect
184+
k8s.io/apiextensions-apiserver v0.34.1 // indirect
185185
k8s.io/cloud-provider v0.34.0 // indirect
186186
k8s.io/controller-manager v0.34.0 // indirect
187187
k8s.io/csi-translation-lib v0.34.0 // indirect

go.sum

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -516,8 +516,8 @@ gorm.io/gorm v1.31.0 h1:0VlycGreVhK7RF/Bwt51Fk8v0xLiiiFdbGDPIZQ7mJY=
516516
gorm.io/gorm v1.31.0/go.mod h1:XyQVbO2k6YkOis7C2437jSit3SsDK72s7n7rsSHd+Gs=
517517
k8s.io/api v0.34.1 h1:jC+153630BMdlFukegoEL8E/yT7aLyQkIVuwhmwDgJM=
518518
k8s.io/api v0.34.1/go.mod h1:SB80FxFtXn5/gwzCoN6QCtPD7Vbu5w2n1S0J5gFfTYk=
519-
k8s.io/apiextensions-apiserver v0.34.0 h1:B3hiB32jV7BcyKcMU5fDaDxk882YrJ1KU+ZSkA9Qxoc=
520-
k8s.io/apiextensions-apiserver v0.34.0/go.mod h1:hLI4GxE1BDBy9adJKxUxCEHBGZtGfIg98Q+JmTD7+g0=
519+
k8s.io/apiextensions-apiserver v0.34.1 h1:NNPBva8FNAPt1iSVwIE0FsdrVriRXMsaWFMqJbII2CI=
520+
k8s.io/apiextensions-apiserver v0.34.1/go.mod h1:hP9Rld3zF5Ay2Of3BeEpLAToP+l4s5UlxiHfqRaRcMc=
521521
k8s.io/apimachinery v0.34.1 h1:dTlxFls/eikpJxmAC7MVE8oOeP1zryV7iRyIjB0gky4=
522522
k8s.io/apimachinery v0.34.1/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw=
523523
k8s.io/apiserver v0.34.1 h1:U3JBGdgANK3dfFcyknWde1G6X1F4bg7PXuvlqt8lITA=
@@ -550,13 +550,13 @@ k8s.io/kubelet v0.34.0 h1:1nZt1Q6Kfx7xCaTS9vnqR9sjZDxf3cRSQkAFCczULmc=
550550
k8s.io/kubelet v0.34.0/go.mod h1:NqbF8ViVettlZbf9hw9DJhubaWn7rGvDDTcLMDm6tQ0=
551551
k8s.io/kubernetes v1.34.1 h1:F3p8dtpv+i8zQoebZeK5zBqM1g9x1aIdnA5vthvcuUk=
552552
k8s.io/kubernetes v1.34.1/go.mod h1:iu+FhII+Oc/1gGWLJcer6wpyih441aNFHl7Pvm8yPto=
553-
k8s.io/utils v0.0.0-20250820121507-0af2bda4dd1d h1:wAhiDyZ4Tdtt7e46e9M5ZSAJ/MnPGPs+Ki1gHw4w1R0=
554-
k8s.io/utils v0.0.0-20250820121507-0af2bda4dd1d/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
553+
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 h1:SjGebBtkBqHFOli+05xYbK8YF1Dzkbzn+gDM4X9T4Ck=
554+
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
555555
rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4=
556556
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.33.0 h1:qPrZsv1cwQiFeieFlRqT627fVZ+tyfou/+S5S0H5ua0=
557557
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.33.0/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw=
558-
sigs.k8s.io/controller-runtime v0.22.1 h1:Ah1T7I+0A7ize291nJZdS1CabF/lB4E++WizgV24Eqg=
559-
sigs.k8s.io/controller-runtime v0.22.1/go.mod h1:FwiwRjkRPbiN+zp2QRp7wlTCzbUXxZ/D4OzuQUDwBHY=
558+
sigs.k8s.io/controller-runtime v0.22.2 h1:cK2l8BGWsSWkXz09tcS4rJh95iOLney5eawcK5A33r4=
559+
sigs.k8s.io/controller-runtime v0.22.2/go.mod h1:+QX1XUpTXN4mLoblf4tqr5CQcyHPAki2HLXqQMY6vh8=
560560
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg=
561561
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg=
562562
sigs.k8s.io/karpenter v1.6.2 h1:WFayZ49CSOaDMku1iYBTsD3A9hOB2yU/U95VcSAJ8KM=

internal/constants/constants.go

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -55,18 +55,18 @@ const (
5555
GpuPoolKey = Domain + "/gpupool"
5656

5757
// Annotation key constants
58-
GpuCountAnnotation = Domain + "/gpu-count"
59-
TFLOPSRequestAnnotation = Domain + "/tflops-request"
60-
VRAMRequestAnnotation = Domain + "/vram-request"
61-
TFLOPSLimitAnnotation = Domain + "/tflops-limit"
62-
VRAMLimitAnnotation = Domain + "/vram-limit"
63-
WorkloadProfileAnnotation = Domain + "/workload-profile"
64-
InjectContainerAnnotation = Domain + "/inject-container"
65-
IsLocalGPUAnnotation = Domain + "/is-local-gpu"
66-
QoSLevelAnnotation = Domain + "/qos"
67-
EmbeddedWorkerAnnotation = Domain + "/embedded-worker"
68-
DedicatedWorkerAnnotation = Domain + "/dedicated-worker"
69-
StandaloneWorkerModeAnnotation = Domain + "/no-standalone-worker-mode"
58+
GpuCountAnnotation = Domain + "/gpu-count"
59+
TFLOPSRequestAnnotation = Domain + "/tflops-request"
60+
VRAMRequestAnnotation = Domain + "/vram-request"
61+
TFLOPSLimitAnnotation = Domain + "/tflops-limit"
62+
VRAMLimitAnnotation = Domain + "/vram-limit"
63+
WorkloadProfileAnnotation = Domain + "/workload-profile"
64+
InjectContainerAnnotation = Domain + "/inject-container"
65+
IsLocalGPUAnnotation = Domain + "/is-local-gpu"
66+
QoSLevelAnnotation = Domain + "/qos"
67+
EmbeddedWorkerAnnotation = Domain + "/embedded-worker"
68+
DedicatedWorkerAnnotation = Domain + "/dedicated-worker"
69+
SidecarWorkerAnnotation = Domain + "/sidecar-worker"
7070
// GPUModelAnnotation specifies the required GPU model (e.g., "A100", "H100")
7171
GPUModelAnnotation = Domain + "/gpu-model"
7272
// GPU ID list is assigned by the scheduler and should not be specified by the user
@@ -76,6 +76,8 @@ const (
7676
PricingAnnotation = Domain + "/hourly-pricing"
7777
// In remote vGPU mode, selected workload is set by user with /workload annotation or generated by system
7878
SelectedWorkloadAnnotation = Domain + "/selected-workload"
79+
// Additional worker pod template is set by user with /worker-pod-template annotation
80+
WorkerPodTemplateAnnotation = Domain + "/worker-pod-template"
7981

8082
WorkloadModeAnnotation = Domain + "/workload-mode"
8183
WorkloadModeDynamic = "dynamic"
@@ -181,6 +183,8 @@ const (
181183
const TFDataPath = "/run/tensor-fusion"
182184
const TFDataPathWorkerExpr = "shm/$(POD_NAMESPACE)/$(POD_NAME)"
183185
const DataVolumeName = "tf-data"
186+
const TransportShmVolumeName = "tf-transport-shm"
187+
const TransportShmPath = "/dev/shm"
184188
const TensorFusionPoolManualCompaction = Domain + "/manual-compaction"
185189
const TensorFusionSystemName = "tensor-fusion"
186190

internal/constants/env.go

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,13 @@ const (
5858

5959
// TensorFusion client related envs
6060
const (
61-
GetConnectionURLEnv = "TENSOR_FUSION_OPERATOR_GET_CONNECTION_URL"
62-
ConnectionNameEnv = "TENSOR_FUSION_CONNECTION_NAME"
63-
ConnectionNamespaceEnv = "TENSOR_FUSION_CONNECTION_NAMESPACE"
61+
GetConnectionURLEnv = "TENSOR_FUSION_OPERATOR_GET_CONNECTION_URL"
62+
ConnectionInfoEnv = "TENSOR_FUSION_OPERATOR_CONNECTION_INFO"
63+
ConnectionNameEnv = "TENSOR_FUSION_CONNECTION_NAME"
64+
ConnectionNamespaceEnv = "TENSOR_FUSION_CONNECTION_NAMESPACE"
65+
DisableVMSharedMemEnv = "TF_USE_IVSHMEM"
66+
ConnectionSharedMemSize = "256"
67+
ConnectionSharedMemName = "tf_shm"
6468

6569
RealNvmlLibPathEnv = "TF_NVML_LIB_PATH"
6670
RealCUDALibPathEnv = "TF_CUDA_LIB_PATH"

0 commit comments

Comments
 (0)