Skip to content

Commit 5c1fcab

Browse files
authored
feat: add pod namespace and container name env vars for worker and hypervisor containers (#263)
1 parent a0486e4 commit 5c1fcab

File tree

2 files changed

+44
-2
lines changed

2 files changed

+44
-2
lines changed

internal/constants/env.go

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -72,21 +72,25 @@ const (
7272
HostIPFieldRef = "status.hostIP"
7373
NodeNameFieldRef = "spec.nodeName"
7474
ResourceNameFieldRef = "metadata.name"
75+
NamespaceFieldRef = "metadata.namespace"
7576
)
7677

7778
// TensorFusion worker related envs
7879
const (
7980
HypervisorIPEnv = "HYPERVISOR_IP"
8081
HypervisorPortEnv = "HYPERVISOR_PORT"
8182

83+
PodNamespaceEnv = "POD_NAMESPACE"
84+
ContainerNameEnv = "CONTAINER_NAME"
85+
8286
TensorFusionRemoteWorkerPortNumber = 8000
8387
TensorFusionRemoteWorkerPortName = "remote-vgpu"
8488
)
8589

8690
// TensorFusion hypervisor related envs
8791
const (
8892
HypervisorPoolNameEnv = "TENSOR_FUSION_POOL_NAME"
89-
HypervisorPodNameEnv = "POD_NAME"
93+
PodNameEnv = "POD_NAME"
9094
VectorPodNodeNameEnv = "NODE_NAME"
9195
HypervisorGPUNodeNameEnv = "GPU_NODE_NAME"
9296
HypervisorSchedulingConfigEnv = "TF_HYPERVISOR_SCHEDULING_CONFIG"

internal/utils/compose.go

Lines changed: 39 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -190,6 +190,23 @@ func AddTFDefaultClientConfBeforePatch(
190190
}, v1.EnvVar{
191191
Name: constants.HypervisorPortEnv,
192192
Value: strconv.Itoa(int(getHypervisorPortNumber(pool.Spec.ComponentConfig.Hypervisor))),
193+
}, v1.EnvVar{
194+
Name: constants.PodNamespaceEnv,
195+
ValueFrom: &v1.EnvVarSource{
196+
FieldRef: &v1.ObjectFieldSelector{
197+
FieldPath: constants.NamespaceFieldRef,
198+
},
199+
},
200+
}, v1.EnvVar{
201+
Name: constants.PodNameEnv,
202+
ValueFrom: &v1.EnvVarSource{
203+
FieldRef: &v1.ObjectFieldSelector{
204+
FieldPath: constants.ResourceNameFieldRef,
205+
},
206+
},
207+
}, v1.EnvVar{
208+
Name: constants.ContainerNameEnv,
209+
Value: pod.Spec.Containers[injectContainerIndex].Name,
193210
})
194211
}
195212
}
@@ -287,7 +304,7 @@ func composeHypervisorContainer(spec *v1.PodSpec, pool *tfv1.GPUPool) {
287304
Name: constants.HypervisorListenAddrEnv,
288305
Value: fmt.Sprintf("%s:%d", constants.DefaultHttpBindIP, port),
289306
}, v1.EnvVar{
290-
Name: constants.HypervisorPodNameEnv,
307+
Name: constants.PodNameEnv,
291308
ValueFrom: &v1.EnvVarSource{
292309
FieldRef: &v1.ObjectFieldSelector{
293310
FieldPath: constants.ResourceNameFieldRef,
@@ -425,6 +442,10 @@ func AddTFNodeDiscoveryConfAfterTemplate(ctx context.Context, tmpl *v1.PodTempla
425442
func AddWorkerConfAfterTemplate(ctx context.Context, spec *v1.PodSpec, workerConfig *tfv1.WorkerConfig, hypervisorConfig *tfv1.HypervisorConfig) {
426443
// NOTE: need to set environment variable to make all GPUs visible to the worker,
427444
// vgpu.rs limiter will limit to specific devices after Pod started
445+
spec.Containers[0].Name = constants.TFContainerNameWorker
446+
if workerConfig.Image != "" {
447+
spec.Containers[0].Image = workerConfig.Image
448+
}
428449
spec.Containers[0].Env = append(spec.Containers[0].Env, v1.EnvVar{
429450
Name: constants.NvidiaVisibleAllDeviceEnv,
430451
Value: constants.NvidiaVisibleAllDeviceValue,
@@ -441,6 +462,23 @@ func AddWorkerConfAfterTemplate(ctx context.Context, spec *v1.PodSpec, workerCon
441462
}, v1.EnvVar{
442463
Name: constants.HypervisorPortEnv,
443464
Value: strconv.Itoa(int(getHypervisorPortNumber(hypervisorConfig))),
465+
}, v1.EnvVar{
466+
Name: constants.PodNameEnv,
467+
ValueFrom: &v1.EnvVarSource{
468+
FieldRef: &v1.ObjectFieldSelector{
469+
FieldPath: constants.ResourceNameFieldRef,
470+
},
471+
},
472+
}, v1.EnvVar{
473+
Name: constants.ContainerNameEnv,
474+
Value: constants.TFContainerNameWorker,
475+
}, v1.EnvVar{
476+
Name: constants.PodNamespaceEnv,
477+
ValueFrom: &v1.EnvVarSource{
478+
FieldRef: &v1.ObjectFieldSelector{
479+
FieldPath: constants.NamespaceFieldRef,
480+
},
481+
},
444482
})
445483

446484
// Add volume from host for CUDA hot migration and snapshot

0 commit comments

Comments
 (0)