Skip to content

Commit ccac8e8

Browse files
Waiman-Long authored and htejun committed
cgroup/cpuset: Fix remote root partition creation problem
Since commit 181c8e0 ("cgroup/cpuset: Introduce remote partition"), a remote partition can be created underneath a non-partition root cpuset as long as its exclusive_cpus are set to distribute exclusive CPUs down to its children. The generate_sched_domains() function, however, doesn't take into account this new behavior and hence will fail to create the sched domain needed for a remote root (non-isolated) partition. There are two issues related to remote partition support. First of all, generate_sched_domains() has a fast path that is activated if root_load_balance is true and top_cpuset.nr_subparts is non-zero. The later condition isn't quite correct for remote partitions as nr_subparts just shows the number of local child partitions underneath it. There can be no local child partition under top_cpuset even if there are remote partitions further down the hierarchy. Fix that by checking for subpartitions_cpus which contains exclusive CPUs allocated to both local and remote partitions. Secondly, the valid partition check for subtree skipping in the csa[] generation loop isn't enough as remote partition does not need to have a partition root parent. Fix this problem by breaking csa[] array generation loop of generate_sched_domains() into v1 and v2 specific parts and checking a cpuset's exclusive_cpus before skipping its subtree in the v2 case. Also simplify generate_sched_domains() for cgroup v2 as only non-isolating partition roots should be included in building the cpuset array and none of the v1 scheduling attributes other than a different way to create an isolated partition are supported. Fixes: 181c8e0 ("cgroup/cpuset: Introduce remote partition") Signed-off-by: Waiman Long <[email protected]> Signed-off-by: Tejun Heo <[email protected]>
1 parent 6fe9601 commit ccac8e8

File tree

1 file changed

+42
-13
lines changed

1 file changed

+42
-13
lines changed

kernel/cgroup/cpuset.c

Lines changed: 42 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ struct cpuset {
169169
/* for custom sched domain */
170170
int relax_domain_level;
171171

172-
/* number of valid sub-partitions */
172+
/* number of valid local child partitions */
173173
int nr_subparts;
174174

175175
/* partition root state */
@@ -957,13 +957,14 @@ static int generate_sched_domains(cpumask_var_t **domains,
957957
int nslot; /* next empty doms[] struct cpumask slot */
958958
struct cgroup_subsys_state *pos_css;
959959
bool root_load_balance = is_sched_load_balance(&top_cpuset);
960+
bool cgrpv2 = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
960961

961962
doms = NULL;
962963
dattr = NULL;
963964
csa = NULL;
964965

965966
/* Special case for the 99% of systems with one, full, sched domain */
966-
if (root_load_balance && !top_cpuset.nr_subparts) {
967+
if (root_load_balance && cpumask_empty(subpartitions_cpus)) {
967968
single_root_domain:
968969
ndoms = 1;
969970
doms = alloc_sched_domains(ndoms);
@@ -992,33 +993,47 @@ static int generate_sched_domains(cpumask_var_t **domains,
992993
cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
993994
if (cp == &top_cpuset)
994995
continue;
996+
997+
if (cgrpv2)
998+
goto v2;
999+
9951000
/*
1001+
* v1:
9961002
* Continue traversing beyond @cp iff @cp has some CPUs and
9971003
* isn't load balancing. The former is obvious. The
9981004
* latter: All child cpusets contain a subset of the
9991005
* parent's cpus, so just skip them, and then we call
10001006
* update_domain_attr_tree() to calc relax_domain_level of
10011007
* the corresponding sched domain.
1002-
*
1003-
* If root is load-balancing, we can skip @cp if it
1004-
* is a subset of the root's effective_cpus.
10051008
*/
10061009
if (!cpumask_empty(cp->cpus_allowed) &&
10071010
!(is_sched_load_balance(cp) &&
10081011
cpumask_intersects(cp->cpus_allowed,
10091012
housekeeping_cpumask(HK_TYPE_DOMAIN))))
10101013
continue;
10111014

1012-
if (root_load_balance &&
1013-
cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
1014-
continue;
1015-
10161015
if (is_sched_load_balance(cp) &&
10171016
!cpumask_empty(cp->effective_cpus))
10181017
csa[csn++] = cp;
10191018

1020-
/* skip @cp's subtree if not a partition root */
1021-
if (!is_partition_valid(cp))
1019+
/* skip @cp's subtree */
1020+
pos_css = css_rightmost_descendant(pos_css);
1021+
continue;
1022+
1023+
v2:
1024+
/*
1025+
* Only valid partition roots that are not isolated and with
1026+
* non-empty effective_cpus will be saved into csn[].
1027+
*/
1028+
if ((cp->partition_root_state == PRS_ROOT) &&
1029+
!cpumask_empty(cp->effective_cpus))
1030+
csa[csn++] = cp;
1031+
1032+
/*
1033+
* Skip @cp's subtree if not a partition root and has no
1034+
* exclusive CPUs to be granted to child cpusets.
1035+
*/
1036+
if (!is_partition_valid(cp) && cpumask_empty(cp->exclusive_cpus))
10221037
pos_css = css_rightmost_descendant(pos_css);
10231038
}
10241039
rcu_read_unlock();
@@ -1072,6 +1087,20 @@ static int generate_sched_domains(cpumask_var_t **domains,
10721087
dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
10731088
GFP_KERNEL);
10741089

1090+
/*
1091+
* Cgroup v2 doesn't support domain attributes, just set all of them
1092+
* to SD_ATTR_INIT. Also non-isolating partition root CPUs are a
1093+
* subset of HK_TYPE_DOMAIN housekeeping CPUs.
1094+
*/
1095+
if (cgrpv2) {
1096+
for (i = 0; i < ndoms; i++) {
1097+
cpumask_copy(doms[i], csa[i]->effective_cpus);
1098+
if (dattr)
1099+
dattr[i] = SD_ATTR_INIT;
1100+
}
1101+
goto done;
1102+
}
1103+
10751104
for (nslot = 0, i = 0; i < csn; i++) {
10761105
struct cpuset *a = csa[i];
10771106
struct cpumask *dp;
@@ -1231,7 +1260,7 @@ static void rebuild_sched_domains_locked(void)
12311260
* root should be only a subset of the active CPUs. Since a CPU in any
12321261
* partition root could be offlined, all must be checked.
12331262
*/
1234-
if (top_cpuset.nr_subparts) {
1263+
if (!cpumask_empty(subpartitions_cpus)) {
12351264
rcu_read_lock();
12361265
cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
12371266
if (!is_partition_valid(cs)) {
@@ -4575,7 +4604,7 @@ static void cpuset_handle_hotplug(void)
45754604
* In the rare case that hotplug removes all the cpus in
45764605
* subpartitions_cpus, we assumed that cpus are updated.
45774606
*/
4578-
if (!cpus_updated && top_cpuset.nr_subparts)
4607+
if (!cpus_updated && !cpumask_empty(subpartitions_cpus))
45794608
cpus_updated = true;
45804609

45814610
/* For v1, synchronize cpus_allowed to cpu_active_mask */

0 commit comments

Comments (0)