
Commit 4ba4f1a

Kan Liang authored and Peter Zijlstra committed
perf: Generic hotplug support for a PMU with a scope
The perf subsystem assumes that the counters of a PMU are per-CPU, so the user-space tool reads a counter from each CPU in system-wide mode. However, many PMUs don't have a per-CPU counter; the counter is effective for a scope, e.g., a die or a socket. To address this, the kernel driver exposes a cpumask that restricts the reads to one CPU standing for the specific scope. In case that CPU is removed, hotplug support has to be implemented by each such driver.

The code to support the cpumask and hotplug is very similar across drivers:

- Expose a cpumask in sysfs.
- Pick another CPU in the same scope if the given CPU is removed.
- Invoke perf_pmu_migrate_context() to migrate to the new CPU.
- In event init, always set event->cpu to the CPU from the cpumask.

Similar duplicated code is implemented in each such PMU driver. It would be good to introduce a generic infrastructure to avoid the duplication.

Five popular scopes are implemented here: core, die, cluster, pkg, and system-wide. The scope can be set when a PMU is registered. If so, a "cpumask" is automatically exposed for the PMU. The "cpumask" comes from the corresponding perf_online_<scope>_mask, which tracks the active CPU for each scope. The masks are set when the first CPU of a scope comes online via the generic perf hotplug support. When such a CPU is removed, the perf_online_<scope>_mask is updated accordingly and the PMU is moved to a new CPU from the same scope if possible.

Signed-off-by: Kan Liang <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
1 parent cd7bdd9 commit 4ba4f1a
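With the generic support in place, a driver only has to set pmu::scope before registration; the core then exposes the "cpumask" attribute and handles CPU hotplug for it. Below is a minimal sketch of how a hypothetical per-package (uncore-style) driver might use this. The driver name, the stubbed callbacks, and the choice of PERF_PMU_SCOPE_PKG are illustrative assumptions, not part of this patch:

#include <linux/module.h>
#include <linux/perf_event.h>

/* Hypothetical per-package PMU driver; the counting callbacks are empty stubs. */
static int hypo_uncore_event_init(struct perf_event *event)
{
	if (event->attr.type != event->pmu->type)
		return -ENOENT;
	/*
	 * No per-driver CPU fixup is needed here: with pmu::scope set,
	 * perf_try_init_event() folds event->cpu onto an online CPU
	 * tracked in perf_online_pkg_mask for the event's package.
	 */
	return 0;
}

static int  hypo_uncore_add(struct perf_event *event, int flags)	{ return 0; }
static void hypo_uncore_del(struct perf_event *event, int flags)	{ }
static void hypo_uncore_start(struct perf_event *event, int flags)	{ }
static void hypo_uncore_stop(struct perf_event *event, int flags)	{ }
static void hypo_uncore_read(struct perf_event *event)			{ }

static struct pmu hypo_uncore_pmu = {
	.task_ctx_nr	= perf_invalid_context,
	.scope		= PERF_PMU_SCOPE_PKG,	/* one logical counter per package */
	.event_init	= hypo_uncore_event_init,
	.add		= hypo_uncore_add,
	.del		= hypo_uncore_del,
	.start		= hypo_uncore_start,
	.stop		= hypo_uncore_stop,
	.read		= hypo_uncore_read,
};

static int __init hypo_uncore_init(void)
{
	/*
	 * Because pmu::scope is set, perf_pmu_register() makes the core
	 * expose the "cpumask" sysfs attribute and migrate the context on
	 * CPU hotplug; the driver installs no cpuhp callbacks of its own.
	 */
	return perf_pmu_register(&hypo_uncore_pmu, "hypo_uncore", -1);
}
module_init(hypo_uncore_init);

MODULE_LICENSE("GPL");

Compare this with existing scoped drivers, which each carry their own cpumask attribute, CPU selection in event_init, and hotplug online/offline callbacks.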

2 files changed: +180, −2 lines

include/linux/perf_event.h

Lines changed: 18 additions & 0 deletions
@@ -295,6 +295,19 @@ struct perf_event_pmu_context;
 #define PERF_PMU_CAP_AUX_OUTPUT			0x0080
 #define PERF_PMU_CAP_EXTENDED_HW_TYPE		0x0100
 
+/**
+ * pmu::scope
+ */
+enum perf_pmu_scope {
+	PERF_PMU_SCOPE_NONE	= 0,
+	PERF_PMU_SCOPE_CORE,
+	PERF_PMU_SCOPE_DIE,
+	PERF_PMU_SCOPE_CLUSTER,
+	PERF_PMU_SCOPE_PKG,
+	PERF_PMU_SCOPE_SYS_WIDE,
+	PERF_PMU_MAX_SCOPE,
+};
+
 struct perf_output_handle;
 
 #define PMU_NULL_DEV	((void *)(~0UL))
@@ -318,6 +331,11 @@ struct pmu {
 	 */
 	int				capabilities;
 
+	/*
+	 * PMU scope
+	 */
+	unsigned int			scope;
+
 	int __percpu			*pmu_disable_count;
 	struct perf_cpu_pmu_context __percpu *cpu_pmu_context;
 	atomic_t			exclusive_cnt; /* < 0: cpu; > 0: tsk */

kernel/events/core.c

Lines changed: 162 additions & 2 deletions
@@ -436,6 +436,11 @@ static LIST_HEAD(pmus);
 static DEFINE_MUTEX(pmus_lock);
 static struct srcu_struct pmus_srcu;
 static cpumask_var_t perf_online_mask;
+static cpumask_var_t perf_online_core_mask;
+static cpumask_var_t perf_online_die_mask;
+static cpumask_var_t perf_online_cluster_mask;
+static cpumask_var_t perf_online_pkg_mask;
+static cpumask_var_t perf_online_sys_mask;
 static struct kmem_cache *perf_event_cache;
 
 /*
@@ -11578,10 +11583,60 @@ perf_event_mux_interval_ms_store(struct device *dev,
 }
 static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
 
+static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu)
+{
+	switch (scope) {
+	case PERF_PMU_SCOPE_CORE:
+		return topology_sibling_cpumask(cpu);
+	case PERF_PMU_SCOPE_DIE:
+		return topology_die_cpumask(cpu);
+	case PERF_PMU_SCOPE_CLUSTER:
+		return topology_cluster_cpumask(cpu);
+	case PERF_PMU_SCOPE_PKG:
+		return topology_core_cpumask(cpu);
+	case PERF_PMU_SCOPE_SYS_WIDE:
+		return cpu_online_mask;
+	}
+
+	return NULL;
+}
+
+static inline struct cpumask *perf_scope_cpumask(unsigned int scope)
+{
+	switch (scope) {
+	case PERF_PMU_SCOPE_CORE:
+		return perf_online_core_mask;
+	case PERF_PMU_SCOPE_DIE:
+		return perf_online_die_mask;
+	case PERF_PMU_SCOPE_CLUSTER:
+		return perf_online_cluster_mask;
+	case PERF_PMU_SCOPE_PKG:
+		return perf_online_pkg_mask;
+	case PERF_PMU_SCOPE_SYS_WIDE:
+		return perf_online_sys_mask;
+	}
+
+	return NULL;
+}
+
+static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr,
+			    char *buf)
+{
+	struct pmu *pmu = dev_get_drvdata(dev);
+	struct cpumask *mask = perf_scope_cpumask(pmu->scope);
+
+	if (mask)
+		return cpumap_print_to_pagebuf(true, buf, mask);
+	return 0;
+}
+
+static DEVICE_ATTR_RO(cpumask);
+
 static struct attribute *pmu_dev_attrs[] = {
 	&dev_attr_type.attr,
 	&dev_attr_perf_event_mux_interval_ms.attr,
 	&dev_attr_nr_addr_filters.attr,
+	&dev_attr_cpumask.attr,
 	NULL,
 };
 
@@ -11593,6 +11648,10 @@ static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int
 	if (n == 2 && !pmu->nr_addr_filters)
 		return 0;
 
+	/* cpumask */
+	if (n == 3 && pmu->scope == PERF_PMU_SCOPE_NONE)
+		return 0;
+
 	return a->mode;
 }
 
@@ -11677,6 +11736,11 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
 		goto free_pdc;
 	}
 
+	if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Can not register a pmu with an invalid scope.\n")) {
+		ret = -EINVAL;
+		goto free_pdc;
+	}
+
 	pmu->name = name;
 
 	if (type >= 0)
@@ -11831,6 +11895,22 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
 		    event_has_any_exclude_flag(event))
 			ret = -EINVAL;
 
+		if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) {
+			const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu);
+			struct cpumask *pmu_cpumask = perf_scope_cpumask(pmu->scope);
+			int cpu;
+
+			if (pmu_cpumask && cpumask) {
+				cpu = cpumask_any_and(pmu_cpumask, cpumask);
+				if (cpu >= nr_cpu_ids)
+					ret = -ENODEV;
+				else
+					event->cpu = cpu;
+			} else {
+				ret = -ENODEV;
+			}
+		}
+
 		if (ret && event->destroy)
 			event->destroy(event);
 	}
@@ -13784,6 +13864,12 @@ static void __init perf_event_init_all_cpus(void)
 	int cpu;
 
 	zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
+	zalloc_cpumask_var(&perf_online_core_mask, GFP_KERNEL);
+	zalloc_cpumask_var(&perf_online_die_mask, GFP_KERNEL);
+	zalloc_cpumask_var(&perf_online_cluster_mask, GFP_KERNEL);
+	zalloc_cpumask_var(&perf_online_pkg_mask, GFP_KERNEL);
+	zalloc_cpumask_var(&perf_online_sys_mask, GFP_KERNEL);
+
 
 	for_each_possible_cpu(cpu) {
 		swhash = &per_cpu(swevent_htable, cpu);
@@ -13833,21 +13919,59 @@ static void __perf_event_exit_context(void *__info)
 	raw_spin_unlock(&ctx->lock);
 }
 
+static void perf_event_clear_cpumask(unsigned int cpu)
+{
+	int target[PERF_PMU_MAX_SCOPE];
+	unsigned int scope;
+	struct pmu *pmu;
+
+	cpumask_clear_cpu(cpu, perf_online_mask);
+
+	for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
+		const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);
+		struct cpumask *pmu_cpumask = perf_scope_cpumask(scope);
+
+		target[scope] = -1;
+		if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
+			continue;
+
+		if (!cpumask_test_and_clear_cpu(cpu, pmu_cpumask))
+			continue;
+		target[scope] = cpumask_any_but(cpumask, cpu);
+		if (target[scope] < nr_cpu_ids)
+			cpumask_set_cpu(target[scope], pmu_cpumask);
+	}
+
+	/* migrate */
+	list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
+		if (pmu->scope == PERF_PMU_SCOPE_NONE ||
+		    WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE))
+			continue;
+
+		if (target[pmu->scope] >= 0 && target[pmu->scope] < nr_cpu_ids)
+			perf_pmu_migrate_context(pmu, cpu, target[pmu->scope]);
+	}
+}
+
 static void perf_event_exit_cpu_context(int cpu)
 {
 	struct perf_cpu_context *cpuctx;
 	struct perf_event_context *ctx;
 
 	// XXX simplify cpuctx->online
 	mutex_lock(&pmus_lock);
+	/*
+	 * Clear the cpumasks, and migrate to other CPUs if possible.
+	 * Must be invoked before the __perf_event_exit_context.
+	 */
+	perf_event_clear_cpumask(cpu);
 	cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
 	ctx = &cpuctx->ctx;
 
 	mutex_lock(&ctx->mutex);
 	smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
 	cpuctx->online = 0;
 	mutex_unlock(&ctx->mutex);
-	cpumask_clear_cpu(cpu, perf_online_mask);
 	mutex_unlock(&pmus_lock);
 }
 #else
@@ -13856,6 +13980,42 @@ static void perf_event_exit_cpu_context(int cpu) { }
 
 #endif
 
+static void perf_event_setup_cpumask(unsigned int cpu)
+{
+	struct cpumask *pmu_cpumask;
+	unsigned int scope;
+
+	cpumask_set_cpu(cpu, perf_online_mask);
+
+	/*
+	 * Early boot stage, the cpumask hasn't been set yet.
+	 * The perf_online_<domain>_masks includes the first CPU of each domain.
+	 * Always uncondifionally set the boot CPU for the perf_online_<domain>_masks.
+	 */
+	if (!topology_sibling_cpumask(cpu)) {
+		for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
+			pmu_cpumask = perf_scope_cpumask(scope);
+			if (WARN_ON_ONCE(!pmu_cpumask))
+				continue;
+			cpumask_set_cpu(cpu, pmu_cpumask);
+		}
+		return;
+	}
+
+	for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
+		const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);
+
+		pmu_cpumask = perf_scope_cpumask(scope);
+
+		if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
+			continue;
+
+		if (!cpumask_empty(cpumask) &&
+		    cpumask_any_and(pmu_cpumask, cpumask) >= nr_cpu_ids)
+			cpumask_set_cpu(cpu, pmu_cpumask);
+	}
+}
+
 int perf_event_init_cpu(unsigned int cpu)
 {
 	struct perf_cpu_context *cpuctx;
@@ -13864,7 +14024,7 @@ int perf_event_init_cpu(unsigned int cpu)
 	perf_swevent_init_cpu(cpu);
 
 	mutex_lock(&pmus_lock);
-	cpumask_set_cpu(cpu, perf_online_mask);
+	perf_event_setup_cpumask(cpu);
 	cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
 	ctx = &cpuctx->ctx;

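On the user-space side, the system-wide case described in the commit message works by reading the PMU's "cpumask" (now produced generically by cpumask_show() above) and opening the event only on the listed CPU. A rough sketch, assuming a scoped PMU registered under the placeholder name "hypo_uncore" and a placeholder event encoding:

#define _GNU_SOURCE
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned int type;
	int cpu, fd;
	uint64_t count;
	FILE *f;

	/* PMU type id assigned by perf_pmu_register(); "hypo_uncore" is a placeholder. */
	f = fopen("/sys/bus/event_source/devices/hypo_uncore/type", "r");
	if (!f || fscanf(f, "%u", &type) != 1)
		return 1;
	fclose(f);

	/* The generic "cpumask" attribute: one CPU per scope instance; the first one is enough here. */
	f = fopen("/sys/bus/event_source/devices/hypo_uncore/cpumask", "r");
	if (!f || fscanf(f, "%d", &cpu) != 1)
		return 1;
	fclose(f);

	struct perf_event_attr attr = {
		.type	= type,
		.size	= sizeof(attr),
		.config	= 0,		/* placeholder event encoding */
	};

	/* pid = -1, cpu = <from cpumask>: count all tasks on the scope's representative CPU. */
	fd = syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	sleep(1);
	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("cpu %d: %llu\n", cpu, (unsigned long long)count);
	return 0;
}

If the chosen CPU is later offlined, perf_event_clear_cpumask() picks another CPU from the same scope and perf_pmu_migrate_context() moves the existing events there, so the open file descriptor keeps counting.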