Skip to content

Commit 6686d1a

Browse files
wkd-woo장재영B
authored and committed
Add configurable allocation policy for replicated and MIG resources
The existing distributedAlloc spreads replicated devices evenly across physical GPUs, which was designed for time-slicing where workloads compete for shared compute. MIG instances, however, are hardware-isolated and do not suffer from contention when packed onto the same GPU. This adds a --allocation-policy flag (env: ALLOCATION_POLICY) with two options: "distributed" (default, preserving current behavior) and "packed" (bin-packing onto fewest physical GPUs). The packed policy helps free up entire GPUs for full-GPU workloads in mixed clusters. The flag applies uniformly to all non-aligned allocation paths (MIG, time-slicing, MPS) and can be configured per-node via ConfigMap and the nvidia.com/device-plugin.config node label. Signed-off-by: 장재영B <jae.j@tossinvest.com>
1 parent fdbb5e0 commit 6686d1a

File tree

6 files changed

+516
-1
lines changed

6 files changed

+516
-1
lines changed

api/config/v1/consts.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,12 @@ const (
4848
DeviceIDStrategyIndex = "index"
4949
)
5050

51+
// Constants to represent the various allocation policies that can be
// selected with the --allocation-policy flag (env: ALLOCATION_POLICY).
52+
const (
53+
// AllocationPolicyDistributed spreads replicated devices evenly across
// physical GPUs. It is the default value of the flag.
AllocationPolicyDistributed = "distributed"
54+
// AllocationPolicyPacked packs replicated devices onto as few physical
// GPUs as possible.
AllocationPolicyPacked = "packed"
55+
)
56+
5157
// Constants related to generating CDI specifications
5258
const (
5359
DefaultCDIAnnotationPrefix = cdiapi.AnnotationPrefix

api/config/v1/flags.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ type PluginCommandLineFlags struct {
7979
CDIAnnotationPrefix *string `json:"cdiAnnotationPrefix" yaml:"cdiAnnotationPrefix"`
8080
NvidiaCTKPath *string `json:"nvidiaCTKPath" yaml:"nvidiaCTKPath"`
8181
ContainerDriverRoot *string `json:"containerDriverRoot" yaml:"containerDriverRoot"`
82+
AllocationPolicy *string `json:"allocationPolicy" yaml:"allocationPolicy"`
8283
}
8384

8485
// deviceListStrategyFlag is a custom type for parsing the deviceListStrategy flag.
@@ -157,6 +158,8 @@ func (f *Flags) UpdateFromCLIFlags(c *cli.Context, flags []cli.Flag) {
157158
updateFromCLIFlag(&f.Plugin.NvidiaCTKPath, c, n)
158159
case "container-driver-root":
159160
updateFromCLIFlag(&f.Plugin.ContainerDriverRoot, c, n)
161+
case "allocation-policy":
162+
updateFromCLIFlag(&f.Plugin.AllocationPolicy, c, n)
160163
}
161164
// GFD specific flags
162165
if f.GFD == nil {

cmd/nvidia-device-plugin/main.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,12 @@ func main() {
144144
Usage: "the path on the host where MPS-specific mounts and files are created by the MPS control daemon manager",
145145
EnvVars: []string{"MPS_ROOT"},
146146
},
147+
&cli.StringFlag{
148+
Name: "allocation-policy",
149+
Value: spec.AllocationPolicyDistributed,
150+
Usage: "the allocation policy for replicated and MIG resources:\n\t\t[distributed | packed]",
151+
EnvVars: []string{"ALLOCATION_POLICY"},
152+
},
147153
&cli.StringFlag{
148154
Name: "device-discovery-strategy",
149155
Value: "auto",
@@ -205,6 +211,15 @@ func validateFlags(infolib nvinfo.Interface, config *spec.Config) error {
205211
return fmt.Errorf("invalid --device-id-strategy option: %v", *config.Flags.Plugin.DeviceIDStrategy)
206212
}
207213

214+
if config.Flags.Plugin.AllocationPolicy != nil {
215+
switch *config.Flags.Plugin.AllocationPolicy {
216+
case spec.AllocationPolicyDistributed:
217+
case spec.AllocationPolicyPacked:
218+
default:
219+
return fmt.Errorf("invalid --allocation-policy option: %v", *config.Flags.Plugin.AllocationPolicy)
220+
}
221+
}
222+
208223
if config.Sharing.SharingStrategy() == spec.SharingStrategyMPS {
209224
if *config.Flags.MigStrategy == spec.MigStrategyMixed {
210225
return fmt.Errorf("using --mig-strategy=mixed is not supported with MPS")

internal/rm/allocate.go

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,3 +78,61 @@ func (r *resourceManager) distributedAlloc(available, required []string, size in
7878

7979
return devices, nil
8080
}
81+
82+
// packedAlloc returns a list of devices such that any replicated devices are
83+
// packed onto as few physical GPUs as possible. It preferentially allocates
84+
// replicas from GPUs that already have the most allocated replicas, which
85+
// helps consolidate workloads and free up entire GPUs for other uses.
86+
func (r *resourceManager) packedAlloc(available, required []string, size int) ([]string, error) {
87+
// Get the set of candidate devices as the difference between available and required.
88+
candidates := r.devices.Subset(available).Difference(r.devices.Subset(required)).GetIDs()
89+
needed := size - len(required)
90+
91+
if len(candidates) < needed {
92+
return nil, fmt.Errorf("not enough available devices to satisfy allocation")
93+
}
94+
95+
// For each candidate device, build a mapping of (stripped) device ID to
96+
// total / available replicas for that device.
97+
replicas := make(map[string]*struct{ total, available int })
98+
for _, c := range candidates {
99+
id := AnnotatedID(c).GetID()
100+
if _, exists := replicas[id]; !exists {
101+
replicas[id] = &struct{ total, available int }{}
102+
}
103+
replicas[id].available++
104+
}
105+
for d := range r.devices {
106+
id := AnnotatedID(d).GetID()
107+
if _, exists := replicas[id]; !exists {
108+
continue
109+
}
110+
replicas[id].total++
111+
}
112+
113+
// Grab the set of 'needed' devices one-by-one from the candidates list.
114+
// Before selecting each candidate, first sort the candidate list using the
115+
// replicas map above. After sorting, the first element in the list will
116+
// contain the device with the greatest difference between total and available
117+
// replications (i.e. the most already allocated). This packs allocations
118+
// onto GPUs that are already in use, freeing up other GPUs entirely.
119+
var devices []string
120+
for i := 0; i < needed; i++ {
121+
sort.Slice(candidates, func(i, j int) bool {
122+
iid := AnnotatedID(candidates[i]).GetID()
123+
jid := AnnotatedID(candidates[j]).GetID()
124+
idiff := replicas[iid].total - replicas[iid].available
125+
jdiff := replicas[jid].total - replicas[jid].available
126+
return idiff > jdiff
127+
})
128+
id := AnnotatedID(candidates[0]).GetID()
129+
replicas[id].available--
130+
devices = append(devices, candidates[0])
131+
candidates = candidates[1:]
132+
}
133+
134+
// Add the set of required devices to this list and return it.
135+
devices = append(required, devices...)
136+
137+
return devices, nil
138+
}

0 commit comments

Comments
 (0)