
Commit 506e64e

Kan Liang authored and Peter Zijlstra committed
perf: attach/detach PMU specific data
The LBR call stack data has to be saved/restored during a context switch to fix the short LBR call stack issue in system-wide mode. Allocate PMU specific data and attach it to the corresponding task_struct during LBR call stack monitoring.

When an LBR call stack event is accounted, the perf_ctx_data for the related tasks is allocated/attached by attach_perf_ctx_data(). When an LBR call stack event is unaccounted, the perf_ctx_data for the related tasks is detached/freed by detach_perf_ctx_data().

An LBR call stack event can be a per-task event or a system-wide event.

- For a per-task event, perf only allocates the perf_ctx_data for the current task. If the allocation fails, perf errors out.

- For a system-wide event, perf has to allocate the perf_ctx_data for both the existing tasks and upcoming tasks. The allocation for the existing tasks is done in perf_event_alloc(). If any allocation fails, perf errors out. The allocation for new tasks is done in perf_event_fork(). A global reader/writer semaphore, global_ctx_data_rwsem, is added to address the global race.

- The perf_ctx_data is only freed by the last LBR call stack event. The number of per-task events is tracked by the refcount of each task. Since system-wide events impact all tasks, it is not practical to go through the whole task list to update the refcount for each system-wide event. The number of system-wide events is instead tracked by a global variable, global_ctx_data_ref.

Suggested-by: "Peter Zijlstra (Intel)" <[email protected]>
Signed-off-by: Kan Liang <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
1 parent fdfda86 commit 506e64e
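
Note: the code below leans on struct perf_ctx_data, which was introduced earlier in this series. Judging only from the fields this diff touches (rcu_head, refcount, global, ctx_cache, data), its shape is roughly the following sketch; see the earlier patch for the authoritative definition.

struct perf_ctx_data {
	struct rcu_head		rcu_head;	/* for RCU-deferred freeing */
	refcount_t		refcount;	/* users: per-task + system-wide */
	int			global;		/* held alive by a system-wide event? */
	struct kmem_cache	*ctx_cache;	/* PMU-provided slab for @data */
	void			*data;		/* the PMU specific payload */
};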

File tree

2 files changed: +291 −1 lines changed

  include/linux/perf_event.h
  kernel/events/core.c

include/linux/perf_event.h

Lines changed: 2 additions & 1 deletion

@@ -676,11 +676,12 @@ struct swevent_hlist {
 #define PERF_ATTACH_GROUP	0x0002
 #define PERF_ATTACH_TASK	0x0004
 #define PERF_ATTACH_TASK_DATA	0x0008
-#define PERF_ATTACH_ITRACE	0x0010
+#define PERF_ATTACH_GLOBAL_DATA	0x0010
 #define PERF_ATTACH_SCHED_CB	0x0020
 #define PERF_ATTACH_CHILD	0x0040
 #define PERF_ATTACH_EXCLUSIVE	0x0080
 #define PERF_ATTACH_CALLCHAIN	0x0100
+#define PERF_ATTACH_ITRACE	0x0200

 struct bpf_prog;
 struct perf_cgroup;

kernel/events/core.c

Lines changed: 289 additions & 0 deletions

@@ -55,6 +55,7 @@
 #include <linux/pgtable.h>
 #include <linux/buildid.h>
 #include <linux/task_work.h>
+#include <linux/percpu-rwsem.h>

 #include "internal.h"

@@ -5217,6 +5218,225 @@ static void unaccount_freq_event(void)
 	atomic_dec(&nr_freq_events);
 }

+
+static struct perf_ctx_data *
+alloc_perf_ctx_data(struct kmem_cache *ctx_cache, bool global)
+{
+	struct perf_ctx_data *cd;
+
+	cd = kzalloc(sizeof(*cd), GFP_KERNEL);
+	if (!cd)
+		return NULL;
+
+	cd->data = kmem_cache_zalloc(ctx_cache, GFP_KERNEL);
+	if (!cd->data) {
+		kfree(cd);
+		return NULL;
+	}
+
+	cd->global = global;
+	cd->ctx_cache = ctx_cache;
+	refcount_set(&cd->refcount, 1);
+
+	return cd;
+}
+
+static void free_perf_ctx_data(struct perf_ctx_data *cd)
+{
+	kmem_cache_free(cd->ctx_cache, cd->data);
+	kfree(cd);
+}
+
+static void __free_perf_ctx_data_rcu(struct rcu_head *rcu_head)
+{
+	struct perf_ctx_data *cd;
+
+	cd = container_of(rcu_head, struct perf_ctx_data, rcu_head);
+	free_perf_ctx_data(cd);
+}
+
+static inline void perf_free_ctx_data_rcu(struct perf_ctx_data *cd)
+{
+	call_rcu(&cd->rcu_head, __free_perf_ctx_data_rcu);
+}
+
+static int
+attach_task_ctx_data(struct task_struct *task, struct kmem_cache *ctx_cache,
+		     bool global)
+{
+	struct perf_ctx_data *cd, *old = NULL;
+
+	cd = alloc_perf_ctx_data(ctx_cache, global);
+	if (!cd)
+		return -ENOMEM;
+
+	for (;;) {
+		if (try_cmpxchg((struct perf_ctx_data **)&task->perf_ctx_data, &old, cd)) {
+			if (old)
+				perf_free_ctx_data_rcu(old);
+			return 0;
+		}
+
+		if (!old) {
+			/*
+			 * After seeing a dead @old, we raced with
+			 * removal and lost, try again to install @cd.
+			 */
+			continue;
+		}
+
+		if (refcount_inc_not_zero(&old->refcount)) {
+			free_perf_ctx_data(cd); /* unused */
+			return 0;
+		}
+
+		/*
+		 * @old is a dead object, refcount==0 is stable, try and
+		 * replace it with @cd.
+		 */
+	}
+	return 0;
+}
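
The loop in attach_task_ctx_data() is the heart of the patch: the task's slot is claimed with try_cmpxchg(), and a loser either piggybacks on the winner's object via refcount_inc_not_zero() or, if the resident object is already dead (refcount stuck at zero), replaces it itself. Below is a minimal userspace analogue in C11 atomics; every name in it is invented for illustration, and plain free() stands in for the kernel's RCU-deferred free.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdlib.h>

struct ctx_data {
	atomic_int refcount;	/* 0 means dead: a remover claimed it */
	void *payload;
};

/* Analogue of refcount_inc_not_zero(): take a reference only while
 * the object is still alive. */
static bool get_unless_zero(struct ctx_data *cd)
{
	int old = atomic_load(&cd->refcount);

	while (old != 0) {
		if (atomic_compare_exchange_weak(&cd->refcount, &old, old + 1))
			return true;
	}
	return false;
}

/* Analogue of attach_task_ctx_data()'s loop: publish @cd into @slot,
 * or share a live occupant, or replace a dead one. */
static int install_ctx_data(_Atomic(struct ctx_data *) *slot,
			    struct ctx_data *cd)
{
	struct ctx_data *old = NULL;

	for (;;) {
		if (atomic_compare_exchange_strong(slot, &old, cd)) {
			free(old);	/* NULL or dead; kernel defers via RCU */
			return 0;
		}
		if (!old)
			continue;	/* raced with a remover; retry install */

		if (get_unless_zero(old)) {
			free(cd);	/* a live object is already there */
			return 0;
		}
		/* @old is dead and refcount==0 is stable: loop to replace it. */
	}
}

The invariant that makes this safe is that a refcount which has hit zero never rises again, so "dead" is a stable state that can be observed and acted on without locks.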
+
+static void __detach_global_ctx_data(void);
+DEFINE_STATIC_PERCPU_RWSEM(global_ctx_data_rwsem);
+static refcount_t global_ctx_data_ref;
+
+static int
+attach_global_ctx_data(struct kmem_cache *ctx_cache)
+{
+	struct task_struct *g, *p;
+	struct perf_ctx_data *cd;
+	int ret;
+
+	if (refcount_inc_not_zero(&global_ctx_data_ref))
+		return 0;
+
+	guard(percpu_write)(&global_ctx_data_rwsem);
+	if (refcount_inc_not_zero(&global_ctx_data_ref))
+		return 0;
+again:
+	/* Allocate everything */
+	scoped_guard (rcu) {
+		for_each_process_thread(g, p) {
+			cd = rcu_dereference(p->perf_ctx_data);
+			if (cd && !cd->global) {
+				cd->global = 1;
+				if (!refcount_inc_not_zero(&cd->refcount))
+					cd = NULL;
+			}
+			if (!cd) {
+				get_task_struct(p);
+				goto alloc;
+			}
+		}
+	}
+
+	refcount_set(&global_ctx_data_ref, 1);
+
+	return 0;
+alloc:
+	ret = attach_task_ctx_data(p, ctx_cache, true);
+	put_task_struct(p);
+	if (ret) {
+		__detach_global_ctx_data();
+		return ret;
+	}
+	goto again;
+}
+
+static int
+attach_perf_ctx_data(struct perf_event *event)
+{
+	struct task_struct *task = event->hw.target;
+	struct kmem_cache *ctx_cache = event->pmu->task_ctx_cache;
+	int ret;
+
+	if (!ctx_cache)
+		return -ENOMEM;
+
+	if (task)
+		return attach_task_ctx_data(task, ctx_cache, false);
+
+	ret = attach_global_ctx_data(ctx_cache);
+	if (ret)
+		return ret;
+
+	event->attach_state |= PERF_ATTACH_GLOBAL_DATA;
+	return 0;
+}
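
attach_global_ctx_data() is a classic double-checked pattern: a lock-free refcount_inc_not_zero() fast path, then the same check again under the writer side of global_ctx_data_rwsem before paying for the walk over every task. A rough userspace rendering of just this skeleton, with a pthread rwlock standing in for the kernel's per-CPU rwsem and all names invented:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

static pthread_rwlock_t global_lock = PTHREAD_RWLOCK_INITIALIZER;
static atomic_int global_ref;	/* number of system-wide events */

static bool ref_inc_not_zero(atomic_int *r)
{
	int old = atomic_load(r);

	while (old != 0) {
		if (atomic_compare_exchange_weak(r, &old, old + 1))
			return true;
	}
	return false;
}

static int attach_global(void)
{
	/* Fast path: some other system-wide event already walked the
	 * task list; just take another reference. */
	if (ref_inc_not_zero(&global_ref))
		return 0;

	/* Slow path: take the writer side to fence off concurrent
	 * fork/exit hooks, and re-check before doing the walk. */
	pthread_rwlock_wrlock(&global_lock);
	if (!ref_inc_not_zero(&global_ref)) {
		/* ...attach ctx data to every existing task here... */
		atomic_store(&global_ref, 1);	/* publish last */
	}
	pthread_rwlock_unlock(&global_lock);
	return 0;
}

A per-CPU rwsem keeps the read side, which the kernel takes on every fork and exit while the feature is active, cheap; the writer side is taken only when the first system-wide event attaches or the last one detaches, which is rare.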
+
+static void
+detach_task_ctx_data(struct task_struct *p)
+{
+	struct perf_ctx_data *cd;
+
+	scoped_guard (rcu) {
+		cd = rcu_dereference(p->perf_ctx_data);
+		if (!cd || !refcount_dec_and_test(&cd->refcount))
+			return;
+	}
+
+	/*
+	 * The old ctx_data may be lost because of the race.
+	 * Nothing needs to be done in that case.
+	 * See attach_task_ctx_data().
+	 */
+	if (try_cmpxchg((struct perf_ctx_data **)&p->perf_ctx_data, &cd, NULL))
+		perf_free_ctx_data_rcu(cd);
+}
+
+static void __detach_global_ctx_data(void)
+{
+	struct task_struct *g, *p;
+	struct perf_ctx_data *cd;
+
+again:
+	scoped_guard (rcu) {
+		for_each_process_thread(g, p) {
+			cd = rcu_dereference(p->perf_ctx_data);
+			if (!cd || !cd->global)
+				continue;
+			cd->global = 0;
+			get_task_struct(p);
+			goto detach;
+		}
+	}
+	return;
+detach:
+	detach_task_ctx_data(p);
+	put_task_struct(p);
+	goto again;
+}
+
+static void detach_global_ctx_data(void)
+{
+	if (refcount_dec_not_one(&global_ctx_data_ref))
+		return;
+
+	guard(percpu_write)(&global_ctx_data_rwsem);
+	if (!refcount_dec_and_test(&global_ctx_data_ref))
+		return;
+
+	/* remove everything */
+	__detach_global_ctx_data();
+}
+
+static void detach_perf_ctx_data(struct perf_event *event)
+{
+	struct task_struct *task = event->hw.target;
+
+	event->attach_state &= ~PERF_ATTACH_TASK_DATA;
+
+	if (task)
+		return detach_task_ctx_data(task);
+
+	if (event->attach_state & PERF_ATTACH_GLOBAL_DATA) {
+		detach_global_ctx_data();
+		event->attach_state &= ~PERF_ATTACH_GLOBAL_DATA;
+	}
+}
+
 static void unaccount_event(struct perf_event *event)
 {
 	bool dec = false;
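
detach_task_ctx_data() completes the protocol from attach_task_ctx_data(): whoever drops the refcount to zero tries to clear the slot, and if the cmpxchg fails because an installer has already swapped in a replacement, the installer frees the dead object instead. Continuing the earlier C11 sketch (same invented types, free() again standing in for RCU-deferred freeing):

/* Analogue of refcount_dec_and_test(). */
static bool put_and_test(struct ctx_data *cd)
{
	return atomic_fetch_sub(&cd->refcount, 1) == 1;
}

/* Analogue of detach_task_ctx_data(): last one out clears the slot. */
static void remove_ctx_data(_Atomic(struct ctx_data *) *slot)
{
	struct ctx_data *cd = atomic_load(slot);

	if (!cd || !put_and_test(cd))
		return;

	/* We made it dead. If the slot still holds it, take it out and
	 * free it; if an installer already replaced it, that installer
	 * saw refcount==0 and frees it (see install_ctx_data()). */
	if (atomic_compare_exchange_strong(slot, &cd, NULL))
		free(cd);
}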
@@ -5398,6 +5618,9 @@ static void __free_event(struct perf_event *event)
 	if (is_cgroup_event(event))
 		perf_detach_cgroup(event);

+	if (event->attach_state & PERF_ATTACH_TASK_DATA)
+		detach_perf_ctx_data(event);
+
 	if (event->destroy)
 		event->destroy(event);

@@ -8607,10 +8830,58 @@ static void perf_event_task(struct task_struct *task,
 			task_ctx);
 }

+/*
+ * Allocate data for a new task when profiling system-wide
+ * events which require PMU specific data
+ */
+static void
+perf_event_alloc_task_data(struct task_struct *child,
+			   struct task_struct *parent)
+{
+	struct kmem_cache *ctx_cache = NULL;
+	struct perf_ctx_data *cd;
+
+	if (!refcount_read(&global_ctx_data_ref))
+		return;
+
+	scoped_guard (rcu) {
+		cd = rcu_dereference(parent->perf_ctx_data);
+		if (cd)
+			ctx_cache = cd->ctx_cache;
+	}
+
+	if (!ctx_cache)
+		return;
+
+	guard(percpu_read)(&global_ctx_data_rwsem);
+	scoped_guard (rcu) {
+		cd = rcu_dereference(child->perf_ctx_data);
+		if (!cd) {
+			/*
+			 * A system-wide event may be unaccounted while
+			 * the perf_ctx_data is being attached.
+			 */
+			if (!refcount_read(&global_ctx_data_ref))
+				return;
+			goto attach;
+		}
+
+		if (!cd->global) {
+			cd->global = 1;
+			refcount_inc(&cd->refcount);
+		}
+	}
+
+	return;
+attach:
+	attach_task_ctx_data(child, ctx_cache, true);
+}
+
 void perf_event_fork(struct task_struct *task)
 {
 	perf_event_task(task, NULL, 1);
 	perf_event_namespaces(task);
+	perf_event_alloc_task_data(task, current);
 }

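perf_event_alloc_task_data() shows why the rwsem exists: the unlocked refcount_read() is only a cheap hint, and its result is only trusted after re-checking under the read lock, where the attach/detach task-list walks are excluded. Continuing the userspace sketches above (ctx_data and install_ctx_data from the attach sketch, global_ref and global_lock from the global one; the utask type is invented, and the existing-object case is simplified away):

struct utask {
	_Atomic(struct ctx_data *) ctx_slot;
};

static void fork_hook(struct utask *child)
{
	/* Unlocked peek: no system-wide event, nothing to do. */
	if (atomic_load(&global_ref) == 0)
		return;

	pthread_rwlock_rdlock(&global_lock);
	/* Re-check under the lock: the last system-wide event may have
	 * been torn down between the peek and the lock acquisition. */
	if (atomic_load(&global_ref) != 0 &&
	    atomic_load(&child->ctx_slot) == NULL) {
		struct ctx_data *cd = calloc(1, sizeof(*cd));

		if (cd) {
			atomic_init(&cd->refcount, 1);
			install_ctx_data(&child->ctx_slot, cd);
		}
	}
	pthread_rwlock_unlock(&global_lock);
}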
@@ -12490,6 +12761,18 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	if (IS_ERR(pmu))
 		return (void*)pmu;

+	/*
+	 * PERF_ATTACH_TASK_DATA is set in event_init()->hw_config().
+	 * The attach should happen right after perf_init_event().
+	 * Otherwise, __free_event() would mistakenly detach a non-existent
+	 * perf_ctx_data on any error in between.
+	 */
+	if (event->attach_state & PERF_ATTACH_TASK_DATA) {
+		err = attach_perf_ctx_data(event);
+		if (err)
+			return ERR_PTR(err);
+	}
+
 	/*
 	 * Disallow uncore-task events. Similarly, disallow uncore-cgroup
 	 * events (they don't make sense as the cgroup will be different
@@ -13637,6 +13920,12 @@ void perf_event_exit_task(struct task_struct *child)
 	 * At this point we need to send EXIT events to cpu contexts.
 	 */
 	perf_event_task(child, NULL, 0);
+
+	/*
+	 * Detach the perf_ctx_data for the system-wide event.
+	 */
+	guard(percpu_read)(&global_ctx_data_rwsem);
+	detach_task_ctx_data(child);
 }

 static void perf_free_event(struct perf_event *event,
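
The exit path mirrors the fork path: it takes the read side of global_ctx_data_rwsem so a task cannot drop its reference while a global attach or detach walk is in flight. In terms of the userspace sketches above (remove_ctx_data and global_lock as defined earlier, utask invented):

static void exit_hook(struct utask *task)
{
	/* Read side again: exclude the attach/detach task-list walks
	 * while this task drops its (possibly global) reference. */
	pthread_rwlock_rdlock(&global_lock);
	remove_ctx_data(&task->ctx_slot);
	pthread_rwlock_unlock(&global_lock);
}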
