Skip to content

Commit 421e3c9

Browse files
committed
Add configurable allocation policy for replicated and MIG resources
The existing distributedAlloc spreads replicated devices evenly across physical GPUs, which was designed for time-slicing where workloads compete for shared compute. MIG instances, however, are hardware-isolated and do not suffer from contention when packed onto the same GPU. This adds a --allocation-policy flag (env: ALLOCATION_POLICY) with two options: "distributed" (default, preserving current behavior) and "packed" (bin-packing onto fewest physical GPUs). The packed policy helps free up entire GPUs for full-GPU workloads in mixed clusters. The flag applies uniformly to all non-aligned allocation paths (MIG, time-slicing, MPS) and can be configured per-node via ConfigMap and the nvidia.com/device-plugin.config node label.
1 parent fdbb5e0 commit 421e3c9

File tree

6 files changed

+516
-1
lines changed

6 files changed

+516
-1
lines changed

api/config/v1/consts.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,12 @@ const (
4848
DeviceIDStrategyIndex = "index"
4949
)
5050

51+
// Constants to represent the various allocation policies. These are the only
// values accepted for the --allocation-policy flag (env: ALLOCATION_POLICY).
52+
const (
53+
AllocationPolicyDistributed = "distributed" // default: spread replicated devices evenly across physical GPUs
54+
AllocationPolicyPacked = "packed" // bin-pack replicated devices onto as few physical GPUs as possible
55+
)
56+
5157
// Constants related to generating CDI specifications
5258
const (
5359
DefaultCDIAnnotationPrefix = cdiapi.AnnotationPrefix

api/config/v1/flags.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ type PluginCommandLineFlags struct {
7979
CDIAnnotationPrefix *string `json:"cdiAnnotationPrefix" yaml:"cdiAnnotationPrefix"`
8080
NvidiaCTKPath *string `json:"nvidiaCTKPath" yaml:"nvidiaCTKPath"`
8181
ContainerDriverRoot *string `json:"containerDriverRoot" yaml:"containerDriverRoot"`
82+
AllocationPolicy *string `json:"allocationPolicy" yaml:"allocationPolicy"`
8283
}
8384

8485
// deviceListStrategyFlag is a custom type for parsing the deviceListStrategy flag.
@@ -157,6 +158,8 @@ func (f *Flags) UpdateFromCLIFlags(c *cli.Context, flags []cli.Flag) {
157158
updateFromCLIFlag(&f.Plugin.NvidiaCTKPath, c, n)
158159
case "container-driver-root":
159160
updateFromCLIFlag(&f.Plugin.ContainerDriverRoot, c, n)
161+
case "allocation-policy":
162+
updateFromCLIFlag(&f.Plugin.AllocationPolicy, c, n)
160163
}
161164
// GFD specific flags
162165
if f.GFD == nil {

cmd/nvidia-device-plugin/main.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,12 @@ func main() {
144144
Usage: "the path on the host where MPS-specific mounts and files are created by the MPS control daemon manager",
145145
EnvVars: []string{"MPS_ROOT"},
146146
},
147+
&cli.StringFlag{
148+
Name: "allocation-policy",
149+
Value: spec.AllocationPolicyDistributed,
150+
Usage: "the allocation policy for replicated and MIG resources:\n\t\t[distributed | packed]",
151+
EnvVars: []string{"ALLOCATION_POLICY"},
152+
},
147153
&cli.StringFlag{
148154
Name: "device-discovery-strategy",
149155
Value: "auto",
@@ -205,6 +211,15 @@ func validateFlags(infolib nvinfo.Interface, config *spec.Config) error {
205211
return fmt.Errorf("invalid --device-id-strategy option: %v", *config.Flags.Plugin.DeviceIDStrategy)
206212
}
207213

214+
if config.Flags.Plugin.AllocationPolicy != nil {
215+
switch *config.Flags.Plugin.AllocationPolicy {
216+
case spec.AllocationPolicyDistributed:
217+
case spec.AllocationPolicyPacked:
218+
default:
219+
return fmt.Errorf("invalid --allocation-policy option: %v", *config.Flags.Plugin.AllocationPolicy)
220+
}
221+
}
222+
208223
if config.Sharing.SharingStrategy() == spec.SharingStrategyMPS {
209224
if *config.Flags.MigStrategy == spec.MigStrategyMixed {
210225
return fmt.Errorf("using --mig-strategy=mixed is not supported with MPS")

internal/rm/allocate.go

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,3 +78,61 @@ func (r *resourceManager) distributedAlloc(available, required []string, size in
7878

7979
return devices, nil
8080
}
81+
82+
// packedAlloc returns a list of devices such that any replicated devices are
83+
// packed onto as few physical GPUs as possible. It preferentially allocates
84+
// replicas from GPUs that already have the most allocated replicas, which
85+
// helps consolidate workloads and free up entire GPUs for other uses.
86+
func (r *resourceManager) packedAlloc(available, required []string, size int) ([]string, error) {
87+
// Get the set of candidate devices as the difference between available and required.
88+
candidates := r.devices.Subset(available).Difference(r.devices.Subset(required)).GetIDs()
89+
needed := size - len(required)
90+
91+
if len(candidates) < needed {
92+
return nil, fmt.Errorf("not enough available devices to satisfy allocation")
93+
}
94+
95+
// For each candidate device, build a mapping of (stripped) device ID to
96+
// total / available replicas for that device.
97+
replicas := make(map[string]*struct{ total, available int })
98+
for _, c := range candidates {
99+
id := AnnotatedID(c).GetID()
100+
if _, exists := replicas[id]; !exists {
101+
replicas[id] = &struct{ total, available int }{}
102+
}
103+
replicas[id].available++
104+
}
105+
for d := range r.devices {
106+
id := AnnotatedID(d).GetID()
107+
if _, exists := replicas[id]; !exists {
108+
continue
109+
}
110+
replicas[id].total++
111+
}
112+
113+
// Grab the set of 'needed' devices one-by-one from the candidates list.
114+
// Before selecting each candidate, first sort the candidate list using the
115+
// replicas map above. After sorting, the first element in the list will
116+
// contain the device with the greatest difference between total and available
117+
// replications (i.e. the most already allocated). This packs allocations
118+
// onto GPUs that are already in use, freeing up other GPUs entirely.
119+
var devices []string
120+
for i := 0; i < needed; i++ {
121+
sort.Slice(candidates, func(i, j int) bool {
122+
iid := AnnotatedID(candidates[i]).GetID()
123+
jid := AnnotatedID(candidates[j]).GetID()
124+
idiff := replicas[iid].total - replicas[iid].available
125+
jdiff := replicas[jid].total - replicas[jid].available
126+
return idiff > jdiff
127+
})
128+
id := AnnotatedID(candidates[0]).GetID()
129+
replicas[id].available--
130+
devices = append(devices, candidates[0])
131+
candidates = candidates[1:]
132+
}
133+
134+
// Add the set of required devices to this list and return it.
135+
devices = append(required, devices...)
136+
137+
return devices, nil
138+
}

0 commit comments

Comments
 (0)