Skip to content

Commit 458ae74

Browse files
authored
fix: node expansion issue, add lib path to env (#402)
* fix: add ld lib path back to env * fix: node expansion issue
1 parent 12c3dd5 commit 458ae74

File tree

3 files changed

+11
-2
lines changed

3 files changed

+11
-2
lines changed

internal/constants/env.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,8 @@ const (
7373
RealNvmlLibPathValue = "/lib/x86_64-linux-gnu/libnvidia-ml.so.1"
7474
RealCUDALibPathValue = "/lib/x86_64-linux-gnu/libcuda.so"
7575

76-
PrependPathEnv = "TF_PREPEND_PATH"
76+
PrependPathEnv = "TF_PREPEND_PATH"
77+
PrependLibPathEnv = "TF_LD_LIBRARY_PATH"
7778

7879
RunInsideGPUEnv = "RUN_INSIDE_GPU_NODE"
7980

internal/scheduler/expander/handler.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,11 +159,16 @@ func (e *NodeExpander) ProcessExpansion(ctx context.Context, pod *corev1.Pod) er
159159
// Step 4: Caused by insufficient GPU resources, try to find a node until one satisfies the pod
160160
preScheduled := false
161161
for _, gpuNode := range gpuNodes {
162+
// When a node has no owner references (i.e. it is not owned by any known provisioner), skip it and keep looking until a node that can be expanded is found
163+
if len(gpuNode.OwnerReferences) == 0 {
164+
continue
165+
}
162166
preparedNode, preparedGPUs := e.prepareNewNodesForScheduleAttempt(gpuNode, nodeGPUs[gpuNode.Name])
163167
if !e.checkGPUFitForNewNode(pod, preparedGPUs) {
164168
continue
165169
}
166170

171+
e.logger.Info("prepare new node for schedule attempt from existing node", "existingNode", gpuNode.Name, "newNode", preparedNode.Name)
167172
err = e.createGPUNodeClaim(ctx, pod, preparedNode)
168173
if err != nil {
169174
return err
@@ -402,7 +407,7 @@ func (e *NodeExpander) createGPUNodeClaim(ctx context.Context, pod *corev1.Pod,
402407
}
403408
if !isKarpenterNodeClaim && !isGPUNodeClaim {
404409
e.logger.Info("node is not owned by any known provisioner, skip expansion", "node", preparedNode.Name)
405-
return nil
410+
return fmt.Errorf("node is not owned by any known provisioner, skip expansion")
406411
}
407412
e.logger.Info("start expanding node from existing template node", "tmplNode", preparedNode.Name)
408413
if isKarpenterNodeClaim {

internal/utils/compose.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,9 @@ func AddTFDefaultClientConfBeforePatch(
196196
pod.Spec.Containers[injectContainerIndex].Env = append(pod.Spec.Containers[injectContainerIndex].Env, v1.EnvVar{
197197
Name: constants.PrependPathEnv,
198198
Value: constants.TFLibsVolumeMountPath,
199+
}, v1.EnvVar{
200+
Name: constants.PrependLibPathEnv,
201+
Value: constants.TFLibsVolumeMountPath,
199202
})
200203

201204
// Known issue: this relies on the glibc ldd config style and does NOT support musl; fortunately, musl is rarely used in AI workloads

0 commit comments

Comments
 (0)