@@ -32,6 +32,7 @@
 #include <linux/relay.h>
 #include <linux/slab.h>
 #include <linux/percpu-rwsem.h>
+#include <linux/cpuset.h>
 
 #include <trace/events/power.h>
 #define CREATE_TRACE_POINTS
@@ -873,6 +874,52 @@ void __init cpuhp_threads_init(void)
 	kthread_unpark(this_cpu_read(cpuhp_state.thread));
 }
 
+/*
+ *
+ * Serialize hotplug trainwrecks outside of the cpu_hotplug_lock
+ * protected region.
+ *
+ * The operation is still serialized against concurrent CPU hotplug via
+ * cpu_add_remove_lock, i.e. CPU map protection. But it is _not_
+ * serialized against other hotplug related activity like adding or
+ * removing of state callbacks and state instances, which invoke either the
+ * startup or the teardown callback of the affected state.
+ *
+ * This is required for subsystems which are unfixable vs. CPU hotplug and
+ * evade lock inversion problems by scheduling work which has to be
+ * completed _before_ cpu_up()/_cpu_down() returns.
+ *
+ * Don't even think about adding anything to this for any new code or even
+ * drivers. Its only purpose is to keep existing lock order trainwrecks
+ * working.
+ *
+ * For cpu_down() there might be valid reasons to finish cleanups which are
+ * not required to be done under cpu_hotplug_lock, but that's a different
+ * story and would not be invoked via this.
+ */
+static void cpu_up_down_serialize_trainwrecks(bool tasks_frozen)
+{
+	/*
+	 * cpusets delegate hotplug operations to a worker to "solve" the
+	 * lock order problems. Wait for the worker, but only if tasks are
+	 * _not_ frozen (suspend, hibernate) as that would wait forever.
+	 *
+	 * The wait is required because otherwise the hotplug operation
+	 * returns with inconsistent state, which could even be observed in
+	 * user space when a new CPU is brought up. The CPU plug uevent
+	 * would be delivered and user space reacting on it would fail to
+	 * move tasks to the newly plugged CPU up to the point where the
+	 * work has finished because up to that point the newly plugged CPU
+	 * is not assignable in cpusets/cgroups. On unplug that's not
+	 * necessarily a visible issue, but it is still inconsistent state,
+	 * which is the real problem which needs to be "fixed". This can't
+	 * prevent the transient state between scheduling the work and
+	 * returning from waiting for it.
+	 */
+	if (!tasks_frozen)
+		cpuset_wait_for_hotplug();
+}
+
 #ifdef CONFIG_HOTPLUG_CPU
 #ifndef arch_clear_mm_cpumask_cpu
 #define arch_clear_mm_cpumask_cpu(cpu, mm)	cpumask_clear_cpu(cpu, mm_cpumask(mm))
@@ -1108,6 +1155,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
 	 */
 	lockup_detector_cleanup();
 	arch_smt_update();
+	cpu_up_down_serialize_trainwrecks(tasks_frozen);
 	return ret;
 }
 
@@ -1302,6 +1350,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
 out:
 	cpus_write_unlock();
 	arch_smt_update();
+	cpu_up_down_serialize_trainwrecks(tasks_frozen);
 	return ret;
 }
 
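For context on what the new helper actually waits for: a minimal sketch, assuming the cpuset side works as the comment above describes, i.e. kernel/cgroup/cpuset.c punts its hotplug handling to a work item (here called cpuset_hotplug_work) and cpuset_wait_for_hotplug() is nothing more than a flush of that work. The identifiers and the function body are an illustration of that assumption, not part of this diff:

/* Assumed cpuset-side counterpart; illustration only, not part of this change. */
#include <linux/workqueue.h>

/* Work item the cpuset code is assumed to queue from its hotplug handling. */
extern struct work_struct cpuset_hotplug_work;

void cpuset_wait_for_hotplug(void)
{
	/* Block until the deferred cpuset hotplug update has completed. */
	flush_work(&cpuset_hotplug_work);
}

Read this way, the tasks_frozen check falls out naturally: during suspend/hibernate the worker that would execute cpuset_hotplug_work is frozen, so flushing it would block forever, exactly as the comment warns. And because the new call sites sit after cpus_write_unlock(), i.e. outside the cpu_hotplug_lock protected region, the flush does not reintroduce the lock inversion the helper exists to avoid.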