Skip to content

Commit 7f90a6f

Browse files
committed
scheduler: fixed a bug where the bandwidth of reserved cores were not taken into account
1 parent 52b2c04 commit 7f90a6f

File tree

5 files changed

+20
-45
lines changed

5 files changed

+20
-45
lines changed

.changelog/26768.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
```release-note:bug
2-
client: fixed a bug where the bandwidth of reserved cores were not taken into account
2+
scheduler: fixed a bug where the bandwidth of reserved cores were not taken into account
33
```

client/client_test.go

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1691,17 +1691,17 @@ func TestClient_UpdateNodeFromDevicesAccumulates(t *testing.T) {
16911691
func TestClient_UpdateNodeFromFingerprintCalculatesReservedResources(t *testing.T) {
16921692
ci.Parallel(t)
16931693

1694-
// Client without network configured updates to match fingerprint
16951694
client, cleanup := TestClient(t, func(c *config.Config) {
1695+
// Parsed version of the client reserved field
16961696
c.Node.ReservedResources.Cpu = structs.NodeReservedCpuResources{
16971697
CpuShares: 100,
16981698
ReservedCpuCores: []uint16{0},
16991699
}
17001700
})
17011701
defer cleanup()
17021702

1703-
// Set up a basic topology where the first core is and an
1704-
// additionally 100 MHz are reserved
1703+
// Set up a basic topology where the first core is reserved and
1704+
// an additionally 100 MHz are reserved
17051705
basicTopology := structs.MockBasicTopology()
17061706
basicTopology.Cores[0].Disable = true
17071707
basicTopology.OverrideWitholdCompute = 100
@@ -1720,7 +1720,9 @@ func TestClient_UpdateNodeFromFingerprintCalculatesReservedResources(t *testing.
17201720

17211721
expectedReservedResources := &structs.NodeReservedResources{
17221722
Cpu: structs.NodeReservedCpuResources{
1723-
CpuShares: 3_600, // 1 * 3500 MHz and 100 MHz
1723+
// set by fingerprinting callback, once the total compute has been detected, it should have converted the
1724+
// client reserved configuration into the effective reserved bandwidth (1 core * 3500 MHz + 100 MHz)
1725+
CpuShares: 3_600,
17241726
ReservedCpuCores: []uint16{0},
17251727
},
17261728
Memory: structs.NodeReservedMemoryResources{

client/lib/numalib/topology.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -282,17 +282,17 @@ func (st *Topology) NumECores() int {
282282
return total
283283
}
284284

285-
// TotalCores returns the number of logical cores detected. The UsableCores will
285+
// AllCores returns the set of logical cores detected. The UsableCores will
286286
// be equal to or less than this value.
287-
func (st *Topology) TotalCores() *idset.Set[hw.CoreID] {
287+
func (st *Topology) AllCores() *idset.Set[hw.CoreID] {
288288
result := idset.Empty[hw.CoreID]()
289289
for _, cpu := range st.Cores {
290290
result.Insert(cpu.ID)
291291
}
292292
return result
293293
}
294294

295-
// UsableCores returns the number of logical cores usable by the Nomad client
295+
// UsableCores returns the set of logical cores usable by the Nomad client
296296
// for running tasks. Nomad must subtract off any reserved cores (reserved.cores)
297297
// and/or must mask the cpuset to the one set in config (config.reservable_cores).
298298
func (st *Topology) UsableCores() *idset.Set[hw.CoreID] {

nomad/structs/funcs_test.go

Lines changed: 8 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,9 @@ func node2k() *Node {
149149
},
150150
ReservedResources: &NodeReservedResources{
151151
Cpu: NodeReservedCpuResources{
152-
CpuShares: 3000, // set by fingerprinting callback
152+
// set by fingerprinting callback, topology of 1000 MHz * 4 cores (4000 MHz), of which 2 cores are reserved
153+
// plus 1000 MHz of reserved amount of CPU, effectively a total of 3000 MHz of reserved CPU
154+
CpuShares: 3000,
153155
ReservedCpuCores: []uint16{2, 3},
154156
},
155157
Memory: NodeReservedMemoryResources{
@@ -214,9 +216,10 @@ func TestAllocsFit(t *testing.T) {
214216
must.Eq(t, 1024, used.Flattened.Memory.MemoryMB)
215217

216218
// Should not fit second allocation
217-
fit, _, used, err = AllocsFit(n, []*Allocation{a1, a1}, nil, false)
219+
fit, dim, used, err = AllocsFit(n, []*Allocation{a1, a1}, nil, false)
218220
must.NoError(t, err)
219221
must.False(t, fit)
222+
must.Eq(t, "cpu", dim)
220223
must.Eq(t, 2000, used.Flattened.Cpu.CpuShares)
221224
must.Eq(t, 2048, used.Flattened.Memory.MemoryMB)
222225

@@ -261,38 +264,6 @@ func TestAllocsFit(t *testing.T) {
261264
must.Eq(t, 1000, used.Flattened.Cpu.CpuShares)
262265
must.Eq(t, []uint16{0}, used.Flattened.Cpu.ReservedCores)
263266
must.Eq(t, 1024, used.Flattened.Memory.MemoryMB)
264-
265-
a3 := &Allocation{
266-
AllocatedResources: &AllocatedResources{
267-
Tasks: map[string]*AllocatedTaskResources{
268-
"web": {
269-
Cpu: AllocatedCpuResources{
270-
CpuShares: 1000,
271-
},
272-
Memory: AllocatedMemoryResources{
273-
MemoryMB: 512,
274-
},
275-
},
276-
},
277-
},
278-
}
279-
280-
// Should fit one allocation
281-
fit, dim, used, err = AllocsFit(n, []*Allocation{a3}, nil, false)
282-
must.NoError(t, err)
283-
must.True(t, fit, must.Sprintf("failed for dimension %q", dim))
284-
must.Eq(t, 1000, used.Flattened.Cpu.CpuShares)
285-
must.Eq(t, []uint16{}, used.Flattened.Cpu.ReservedCores)
286-
must.Eq(t, 512, used.Flattened.Memory.MemoryMB)
287-
288-
// Should not fit second allocation
289-
fit, dim, used, err = AllocsFit(n, []*Allocation{a3, a3}, nil, false)
290-
must.NoError(t, err)
291-
must.False(t, fit)
292-
must.Eq(t, "cpu", dim)
293-
must.Eq(t, 2000, used.Flattened.Cpu.CpuShares)
294-
must.Eq(t, []uint16{}, used.Flattened.Cpu.ReservedCores)
295-
must.Eq(t, 1024, used.Flattened.Memory.MemoryMB)
296267
}
297268

298269
func TestAllocsFit_Cores(t *testing.T) {
@@ -721,7 +692,9 @@ func TestScoreFitBinPack(t *testing.T) {
721692
node.NodeResources.Compatibility()
722693
node.ReservedResources = &NodeReservedResources{
723694
Cpu: NodeReservedCpuResources{
724-
CpuShares: 6144, // set by fingerprinting callback
695+
// set by fingerprinting callback, topology of 2048 MHz * 4 cores (8192 MHz), of which 2 cores are reserved
696+
// plus 2048 MHz of reserved amount of CPU, effectively a total of 6144 MHz of reserved CPU
697+
CpuShares: 6144,
725698
ReservedCpuCores: []uint16{2, 3},
726699
},
727700
Memory: NodeReservedMemoryResources{

nomad/structs/structs.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3244,8 +3244,8 @@ func (n *NodeResources) Comparable() *ComparableResources {
32443244
return nil
32453245
}
32463246

3247-
totalCores := n.Processors.Topology.TotalCores().Slice()
3248-
cores := helper.ConvertSlice(totalCores, func(id hw.CoreID) uint16 {
3247+
allCores := n.Processors.Topology.AllCores().Slice()
3248+
cores := helper.ConvertSlice(allCores, func(id hw.CoreID) uint16 {
32493249
return uint16(id)
32503250
})
32513251

0 commit comments

Comments
 (0)