Skip to content

Commit 1f9350d

Browse files
authored
unset env NVIDIA_VISIBLE_DEVICES when gpushare is enabled (#1273)
* unset env NVIDIA_VISIBLE_DEVICES when gpushare is enabled Signed-off-by: Yi Chen <github@chenyicn.net> * Group constants into one const block Signed-off-by: Yi Chen <github@chenyicn.net> --------- Signed-off-by: Yi Chen <github@chenyicn.net>
1 parent 23e9731 commit 1f9350d

File tree

4 files changed

+68
-10
lines changed

4 files changed

+68
-10
lines changed

pkg/argsbuilder/serving.go

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,15 +24,16 @@ import (
2424
"strings"
2525
"time"
2626

27+
log "github.com/sirupsen/logrus"
28+
"github.com/spf13/cobra"
29+
"k8s.io/apimachinery/pkg/api/errors"
2730
"k8s.io/apimachinery/pkg/api/resource"
31+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2832

2933
"github.com/kubeflow/arena/pkg/apis/config"
3034
"github.com/kubeflow/arena/pkg/apis/types"
35+
"github.com/kubeflow/arena/pkg/common"
3136
"github.com/kubeflow/arena/pkg/util"
32-
log "github.com/sirupsen/logrus"
33-
"github.com/spf13/cobra"
34-
"k8s.io/apimachinery/pkg/api/errors"
35-
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3637
)
3738

3839
type ServingArgsBuilder struct {
@@ -685,12 +686,26 @@ func (s *ServingArgsBuilder) disabledNvidiaENVWithNoneGPURequest() error {
685686
if s.args.Envs == nil {
686687
s.args.Envs = map[string]string{}
687688
}
689+
690+
// Handle cloud vendor special configurations first.
691+
if s.hasAliyunGPUConfig() {
692+
return nil
693+
}
694+
695+
// Handle general GPU logic.
688696
if s.args.GPUCount == 0 && s.args.GPUMemory == 0 && s.args.GPUCore == 0 {
689-
s.args.Envs["NVIDIA_VISIBLE_DEVICES"] = "void"
697+
s.args.Envs[common.ENV_NVIDIA_VISIBLE_DEVICES] = "void"
690698
}
699+
691700
return nil
692701
}
693702

703+
// hasAliyunGPUConfig reports whether args has labels or devices related to Aliyun GPU config.
704+
func (s *ServingArgsBuilder) hasAliyunGPUConfig() bool {
705+
return (s.args.Labels != nil && s.args.Labels[common.LABEL_ALIYUN_COM_GPU_COUNT] != "") ||
706+
(s.args.Devices != nil && s.args.Devices[common.DEVICE_ALIYUN_COM_GPU_MEM] != "")
707+
}
708+
694709
func (s *ServingArgsBuilder) checkGPUCore() error {
695710
if s.args.GPUCore%5 != 0 {
696711
return fmt.Errorf("GPUCore should be the multiple of 5")

pkg/argsbuilder/serving_distributed.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,10 @@ import (
1919
"reflect"
2020
"strings"
2121

22-
"github.com/kubeflow/arena/pkg/apis/types"
2322
"github.com/spf13/cobra"
23+
24+
"github.com/kubeflow/arena/pkg/apis/types"
25+
"github.com/kubeflow/arena/pkg/common"
2426
)
2527

2628
type DistributedServingArgsBuilder struct {
@@ -139,7 +141,7 @@ func (s *DistributedServingArgsBuilder) setNvidiaENV() error {
139141
// Master and worker share the same envs but may have different
140142
// GPU resources, so we delete the NVIDIA_VISIBLE_DEVICES env here
141143
// and set it manually in the helm chart.
142-
delete(s.args.Envs, "NVIDIA_VISIBLE_DEVICES")
144+
delete(s.args.Envs, common.ENV_NVIDIA_VISIBLE_DEVICES)
143145
return nil
144146
}
145147

pkg/argsbuilder/submit.go

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,13 @@ import (
2323
"strconv"
2424
"strings"
2525

26+
log "github.com/sirupsen/logrus"
27+
"github.com/spf13/cobra"
28+
2629
"github.com/kubeflow/arena/pkg/apis/config"
2730
"github.com/kubeflow/arena/pkg/apis/types"
31+
"github.com/kubeflow/arena/pkg/common"
2832
"github.com/kubeflow/arena/pkg/util"
29-
log "github.com/sirupsen/logrus"
30-
"github.com/spf13/cobra"
3133
)
3234

3335
type SubmitArgsBuilder struct {
@@ -628,12 +630,26 @@ func (s *SubmitArgsBuilder) disabledNvidiaENVWithNoneGPURequest() error {
628630
if s.args.Envs == nil {
629631
s.args.Envs = map[string]string{}
630632
}
633+
634+
// Handle cloud vendor special configurations first.
635+
if s.hasAliyunGPUConfig() {
636+
return nil
637+
}
638+
639+
// Handle general GPU logic.
631640
if s.args.GPUCount == 0 {
632-
s.args.Envs["NVIDIA_VISIBLE_DEVICES"] = "void"
641+
s.args.Envs[common.ENV_NVIDIA_VISIBLE_DEVICES] = "void"
633642
}
643+
634644
return nil
635645
}
636646

647+
// hasAliyunGPUConfig reports whether args has labels or devices related to Aliyun GPU config.
648+
func (s *SubmitArgsBuilder) hasAliyunGPUConfig() bool {
649+
return (s.args.Labels != nil && s.args.Labels[common.LABEL_ALIYUN_COM_GPU_COUNT] != "") ||
650+
(s.args.Devices != nil && s.args.Devices[common.DEVICE_ALIYUN_COM_GPU_MEM] != "")
651+
}
652+
637653
func (s *SubmitArgsBuilder) setJobInfoToEnv() error {
638654
if s.args.Envs == nil {
639655
s.args.Envs = map[string]string{}

pkg/common/constants.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
//
2+
// Copyright 2025 The Kubeflow Authors.
3+
//
4+
// Licensed under the Apache License, Version 2.0 (the "License");
5+
// you may not use this file except in compliance with the License.
6+
// You may obtain a copy of the License at
7+
//
8+
// http://www.apache.org/licenses/LICENSE-2.0
9+
//
10+
// Unless required by applicable law or agreed to in writing, software
11+
// distributed under the License is distributed on an "AS IS" BASIS,
12+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
// See the License for the specific language governing permissions and
14+
// limitations under the License.
15+
//
16+
17+
package common
18+
19+
const (
20+
LABEL_ALIYUN_COM_GPU_COUNT = "aliyun.com/gpu-count"
21+
22+
ENV_NVIDIA_VISIBLE_DEVICES = "NVIDIA_VISIBLE_DEVICES"
23+
24+
DEVICE_ALIYUN_COM_GPU_MEM = "aliyun.com/gpu-mem"
25+
)

0 commit comments

Comments
 (0)