Skip to content

Commit f28e224

Browse files
Waiman-Long authored and htejun committed
cgroup/cpuset: Add a new isolated cpus.partition type
Cpuset v1 uses the sched_load_balance control file to determine if load balancing should be enabled. Cpuset v2 gets rid of sched_load_balance as its use may require disabling load balancing at cgroup root. For workloads that require very low latency like DPDK, the latency jitters caused by periodic load balancing may exceed the desired latency limit. When cpuset v2 is in use, the only way to avoid this latency cost is to use the "isolcpus=" kernel boot option to isolate a set of CPUs. After the kernel boot, however, there is no way to add or remove CPUs from this isolated set. For workloads that are more dynamic in nature, that means users have to provision enough CPUs for the worst case situation resulting in excess idle CPUs. To address this issue for cpuset v2, a new cpuset.cpus.partition type "isolated" is added which allows the creation of a cpuset partition without load balancing. This will allow system administrators to dynamically adjust the size of isolated partition to the current need of the workload without rebooting the system. Signed-off-by: Waiman Long <[email protected]> Signed-off-by: Tejun Heo <[email protected]>
1 parent f0af1bf commit f28e224

File tree

1 file changed

+63
-11
lines changed

1 file changed

+63
-11
lines changed

kernel/cgroup/cpuset.c

Lines changed: 63 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -178,11 +178,15 @@ struct cpuset {
178178
*
179179
* 0 - member (not a partition root)
180180
* 1 - partition root
181+
* 2 - partition root without load balancing (isolated)
181182
* -1 - invalid partition root
183+
* -2 - invalid isolated partition root
182184
*/
183185
#define PRS_MEMBER 0
184186
#define PRS_ROOT 1
187+
#define PRS_ISOLATED 2
185188
#define PRS_INVALID_ROOT -1
189+
#define PRS_INVALID_ISOLATED -2
186190

187191
static inline bool is_prs_invalid(int prs_state)
188192
{
@@ -282,7 +286,8 @@ static inline int is_partition_invalid(const struct cpuset *cs)
282286
*/
283287
static inline void make_partition_invalid(struct cpuset *cs)
284288
{
285-
cs->partition_root_state = PRS_INVALID_ROOT;
289+
if (is_partition_valid(cs))
290+
cs->partition_root_state = -cs->partition_root_state;
286291
}
287292

288293
/*
@@ -1380,17 +1385,19 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
13801385

13811386
if (cmd == partcmd_update) {
13821387
/*
1383-
* Check for possible transition between PRS_ROOT
1384-
* and PRS_INVALID_ROOT.
1388+
* Check for possible transition between valid and invalid
1389+
* partition root.
13851390
*/
13861391
switch (cs->partition_root_state) {
13871392
case PRS_ROOT:
1393+
case PRS_ISOLATED:
13881394
if (part_error)
1389-
new_prs = PRS_INVALID_ROOT;
1395+
new_prs = -old_prs;
13901396
break;
13911397
case PRS_INVALID_ROOT:
1398+
case PRS_INVALID_ISOLATED:
13921399
if (!part_error)
1393-
new_prs = PRS_ROOT;
1400+
new_prs = -old_prs;
13941401
break;
13951402
}
13961403
}
@@ -1400,7 +1407,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
14001407

14011408
/*
14021409
* Transitioning between invalid to valid or vice versa may require
1403-
* changing CS_CPU_EXCLUSIVE.
1410+
* changing CS_CPU_EXCLUSIVE and CS_SCHED_LOAD_BALANCE.
14041411
*/
14051412
if (old_prs != new_prs) {
14061413
if (is_prs_invalid(old_prs) && !is_cpu_exclusive(cs) &&
@@ -1443,8 +1450,17 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
14431450
if (adding || deleting)
14441451
update_tasks_cpumask(parent);
14451452

1453+
/*
1454+
* Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary.
1455+
* rebuild_sched_domains_locked() may be called.
1456+
*/
1457+
if (old_prs != new_prs) {
1458+
if (old_prs == PRS_ISOLATED)
1459+
update_flag(CS_SCHED_LOAD_BALANCE, cs, 1);
1460+
else if (new_prs == PRS_ISOLATED)
1461+
update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1462+
}
14461463
notify_partition_change(cs, old_prs);
1447-
14481464
return 0;
14491465
}
14501466

@@ -1519,6 +1535,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
15191535
if ((cp != cs) && old_prs) {
15201536
switch (parent->partition_root_state) {
15211537
case PRS_ROOT:
1538+
case PRS_ISOLATED:
15221539
update_parent = true;
15231540
break;
15241541

@@ -1528,7 +1545,8 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
15281545
* invalid, child partition roots become
15291546
* invalid too.
15301547
*/
1531-
new_prs = PRS_INVALID_ROOT;
1548+
if (is_partition_valid(cp))
1549+
new_prs = -cp->partition_root_state;
15321550
break;
15331551
}
15341552
}
@@ -2110,6 +2128,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
21102128
static int update_prstate(struct cpuset *cs, int new_prs)
21112129
{
21122130
int err = 0, old_prs = cs->partition_root_state;
2131+
bool sched_domain_rebuilt = false;
21132132
struct cpuset *parent = parent_cs(cs);
21142133
struct tmpmasks tmpmask;
21152134

@@ -2120,8 +2139,10 @@ static int update_prstate(struct cpuset *cs, int new_prs)
21202139
* For a previously invalid partition root, leave it at being
21212140
* invalid if new_prs is not "member".
21222141
*/
2123-
if (new_prs && is_prs_invalid(old_prs))
2142+
if (new_prs && is_prs_invalid(old_prs)) {
2143+
cs->partition_root_state = -new_prs;
21242144
return 0;
2145+
}
21252146

21262147
if (alloc_cpumasks(NULL, &tmpmask))
21272148
return -ENOMEM;
@@ -2147,6 +2168,22 @@ static int update_prstate(struct cpuset *cs, int new_prs)
21472168
update_flag(CS_CPU_EXCLUSIVE, cs, 0);
21482169
goto out;
21492170
}
2171+
2172+
if (new_prs == PRS_ISOLATED) {
2173+
/*
2174+
* Disable the load balance flag should not return an
2175+
* error unless the system is running out of memory.
2176+
*/
2177+
update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
2178+
sched_domain_rebuilt = true;
2179+
}
2180+
} else if (old_prs && new_prs) {
2181+
/*
2182+
* A change in load balance state only, no change in cpumasks.
2183+
*/
2184+
update_flag(CS_SCHED_LOAD_BALANCE, cs, (new_prs != PRS_ISOLATED));
2185+
sched_domain_rebuilt = true;
2186+
goto out; /* Sched domain is rebuilt in update_flag() */
21502187
} else {
21512188
/*
21522189
* Switching back to member is always allowed even if it
@@ -2168,20 +2205,27 @@ static int update_prstate(struct cpuset *cs, int new_prs)
21682205

21692206
/* Turning off CS_CPU_EXCLUSIVE will not return error */
21702207
update_flag(CS_CPU_EXCLUSIVE, cs, 0);
2208+
2209+
if (!is_sched_load_balance(cs)) {
2210+
/* Make sure load balance is on */
2211+
update_flag(CS_SCHED_LOAD_BALANCE, cs, 1);
2212+
sched_domain_rebuilt = true;
2213+
}
21712214
}
21722215

21732216
update_tasks_cpumask(parent);
21742217

21752218
if (parent->child_ecpus_count)
21762219
update_sibling_cpumasks(parent, cs, &tmpmask);
21772220

2178-
rebuild_sched_domains_locked();
2221+
if (!sched_domain_rebuilt)
2222+
rebuild_sched_domains_locked();
21792223
out:
21802224
/*
21812225
* Make partition invalid if an error happen
21822226
*/
21832227
if (err)
2184-
new_prs = PRS_INVALID_ROOT;
2228+
new_prs = -new_prs;
21852229
spin_lock_irq(&callback_lock);
21862230
cs->partition_root_state = new_prs;
21872231
spin_unlock_irq(&callback_lock);
@@ -2691,12 +2735,18 @@ static int sched_partition_show(struct seq_file *seq, void *v)
26912735
case PRS_ROOT:
26922736
seq_puts(seq, "root\n");
26932737
break;
2738+
case PRS_ISOLATED:
2739+
seq_puts(seq, "isolated\n");
2740+
break;
26942741
case PRS_MEMBER:
26952742
seq_puts(seq, "member\n");
26962743
break;
26972744
case PRS_INVALID_ROOT:
26982745
seq_puts(seq, "root invalid\n");
26992746
break;
2747+
case PRS_INVALID_ISOLATED:
2748+
seq_puts(seq, "isolated invalid\n");
2749+
break;
27002750
}
27012751
return 0;
27022752
}
@@ -2717,6 +2767,8 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
27172767
val = PRS_ROOT;
27182768
else if (!strcmp(buf, "member"))
27192769
val = PRS_MEMBER;
2770+
else if (!strcmp(buf, "isolated"))
2771+
val = PRS_ISOLATED;
27202772
else
27212773
return -EINVAL;
27222774

0 commit comments

Comments
 (0)