 #include <linux/highmem.h>
 #include <linux/pgtable.h>
 #include <linux/buildid.h>
+#include <linux/task_work.h>
 
 #include "internal.h"
 
@@ -2276,11 +2277,26 @@ event_sched_out(struct perf_event *event,
         event->pmu->del(event, 0);
         event->oncpu = -1;
 
-        if (READ_ONCE(event->pending_disable) >= 0) {
-                WRITE_ONCE(event->pending_disable, -1);
+        if (event->pending_disable) {
+                event->pending_disable = 0;
                 perf_cgroup_event_disable(event, ctx);
                 state = PERF_EVENT_STATE_OFF;
         }
+
+        if (event->pending_sigtrap) {
+                bool dec = true;
+
+                event->pending_sigtrap = 0;
+                if (state != PERF_EVENT_STATE_OFF &&
+                    !event->pending_work) {
+                        event->pending_work = 1;
+                        dec = false;
+                        task_work_add(current, &event->pending_task, TWA_RESUME);
+                }
+                if (dec)
+                        local_dec(&event->ctx->nr_pending);
+        }
+
         perf_event_set_state(event, state);
 
         if (!is_software_event(event))
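
The hunk above is the heart of the fix: if an event is scheduled out while its SIGTRAP is still pending, delivery is handed off to task_work so it runs when the task next returns to user space. For readers unfamiliar with that API, here is a minimal sketch of the pattern under the same assumptions (the struct and function names are illustrative, not from the patch):

    #include <linux/sched.h>
    #include <linux/task_work.h>

    struct deferred {
            struct callback_head work;      /* embedded; recovered via container_of() */
            int payload;
    };

    /* Runs in process context, just before the task returns to user space. */
    static void deferred_fn(struct callback_head *head)
    {
            struct deferred *d = container_of(head, struct deferred, work);

            pr_info("deferred payload: %d\n", d->payload);
    }

    static void defer_to_return(struct deferred *d)
    {
            init_task_work(&d->work, deferred_fn);
            /* TWA_RESUME: run on the next return to user space. */
            if (task_work_add(current, &d->work, TWA_RESUME))
                    pr_warn("task already exiting; work not queued\n");
    }

Note how event_sched_out() sets pending_work before queueing, so at most one callback per event is ever outstanding.
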
@@ -2432,7 +2448,7 @@ static void __perf_event_disable(struct perf_event *event,
          * hold the top-level event's child_mutex, so any descendant that
          * goes to exit will block in perf_event_exit_event().
          *
-         * When called from perf_pending_event it's OK because event->ctx
+         * When called from perf_pending_irq it's OK because event->ctx
          * is the current context on this CPU and preemption is disabled,
          * hence we can't get into perf_event_task_sched_out for this context.
          */
@@ -2471,9 +2487,8 @@ EXPORT_SYMBOL_GPL(perf_event_disable);
 
 void perf_event_disable_inatomic(struct perf_event *event)
 {
-        WRITE_ONCE(event->pending_disable, smp_processor_id());
-        /* can fail, see perf_pending_event_disable() */
-        irq_work_queue(&event->pending);
+        event->pending_disable = 1;
+        irq_work_queue(&event->pending_irq);
 }
 
 #define MAX_INTERRUPTS (~0ULL)
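
perf_event_disable_inatomic() is called from NMI-like contexts where almost nothing may be done directly, so the actual work is bounced through irq_work. A minimal sketch of that mechanism (illustrative names; not part of the patch):

    #include <linux/irq_work.h>

    /* Runs in hard-irq context from a self-IPI, shortly after queueing. */
    static void my_handler(struct irq_work *work)
    {
            /* keep this short and non-blocking */
    }

    static DEFINE_IRQ_WORK(my_work, my_handler);

    static void poke_from_nmi(void)
    {
            /* Safe from NMI; returns false if the work was already queued. */
            irq_work_queue(&my_work);
    }

Note the simplification in the hunk: pending_disable shrinks from "which CPU wants the disable" to a plain flag, since __perf_pending_irq() (further down) now derives the CPU from event->oncpu.
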
@@ -3428,11 +3443,23 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
                 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
                 if (context_equiv(ctx, next_ctx)) {
 
+                        perf_pmu_disable(pmu);
+
+                        /* PMIs are disabled; ctx->nr_pending is stable. */
+                        if (local_read(&ctx->nr_pending) ||
+                            local_read(&next_ctx->nr_pending)) {
+                                /*
+                                 * Must not swap out ctx when there are pending
+                                 * events that rely on the ctx->task relation.
+                                 */
+                                raw_spin_unlock(&next_ctx->lock);
+                                rcu_read_unlock();
+                                goto inside_switch;
+                        }
+
                         WRITE_ONCE(ctx->task, next);
                         WRITE_ONCE(next_ctx->task, task);
 
-                        perf_pmu_disable(pmu);
-
                         if (cpuctx->sched_cb_usage && pmu->sched_task)
                                 pmu->sched_task(ctx, false);
@@ -3473,6 +3500,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
         raw_spin_lock(&ctx->lock);
         perf_pmu_disable(pmu);
 
+inside_switch:
         if (cpuctx->sched_cb_usage && pmu->sched_task)
                 pmu->sched_task(ctx, false);
         task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
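
Both checks above rely on ctx->nr_pending, a new local_t counter. local_t operations are atomic only with respect to interrupts on the owning CPU, which is exactly the guarantee needed here, since the PMU (and thus the PMI) is disabled across the test. A sketch of the semantics (illustrative, not the patch's code):

    #include <asm/local.h>

    static local_t nr_pending = LOCAL_INIT(0);

    static void pmi_path(void)
    {
            local_inc(&nr_pending);         /* safe against IRQ/NMI on this CPU */
    }

    static bool context_switch_path(void)
    {
            /* stable while the PMI cannot fire on this CPU */
            return local_read(&nr_pending) != 0;
    }

Taking the inside_switch path trades the cheap context swap for a full reschedule of events, but only while a sigtrap is actually in flight.
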
@@ -4939,7 +4967,7 @@ static void perf_addr_filters_splice(struct perf_event *event,
 
 static void _free_event(struct perf_event *event)
 {
-        irq_work_sync(&event->pending);
+        irq_work_sync(&event->pending_irq);
 
         unaccount_event(event);
 
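
The irq_work_sync() here also illustrates a general teardown-ordering rule: a handler may still be running when the object is freed, so the free path must wait for it. Reduced to its core (hypothetical struct; not from the patch):

    #include <linux/irq_work.h>
    #include <linux/slab.h>

    struct my_obj {
            struct irq_work pending_irq;
            /* ... */
    };

    static void destroy(struct my_obj *obj)
    {
            /* Wait for any in-flight handler before freeing its container. */
            irq_work_sync(&obj->pending_irq);
            kfree(obj);
    }
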
@@ -6439,7 +6467,8 @@ static void perf_sigtrap(struct perf_event *event)
                 return;
 
         /*
-         * perf_pending_event() can race with the task exiting.
+         * Both perf_pending_task() and perf_pending_irq() can race with the
+         * task exiting.
          */
         if (current->flags & PF_EXITING)
                 return;
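
For context, perf_sigtrap() is the delivery point for events opened with attr.sigtrap set. From user space the feature looks roughly like this (a hedged sketch: error handling omitted, the cookie value arbitrary):

    #include <signal.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <linux/perf_event.h>

    static void on_trap(int sig, siginfo_t *info, void *uc)
    {
            /* si_code == TRAP_PERF; si_perf_data carries attr.sig_data */
    }

    int open_sigtrap_counter(void)
    {
            struct sigaction sa = { .sa_sigaction = on_trap, .sa_flags = SA_SIGINFO };
            struct perf_event_attr attr;

            sigaction(SIGTRAP, &sa, NULL);

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_HARDWARE;
            attr.config = PERF_COUNT_HW_INSTRUCTIONS;
            attr.sample_period = 1000000;   /* SIGTRAP every ~1M instructions */
            attr.sigtrap = 1;
            attr.sig_data = 0x1234;         /* comes back as si_perf_data */
            attr.remove_on_exec = 1;        /* required together with sigtrap */

            return syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
    }
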
@@ -6448,23 +6477,33 @@ static void perf_sigtrap(struct perf_event *event)
                       event->attr.type, event->attr.sig_data);
 }
 
-static void perf_pending_event_disable(struct perf_event *event)
+/*
+ * Deliver the pending work in-event-context or follow the context.
+ */
+static void __perf_pending_irq(struct perf_event *event)
 {
-        int cpu = READ_ONCE(event->pending_disable);
+        int cpu = READ_ONCE(event->oncpu);
 
+        /*
+         * If the event isn't running, we're done; event_sched_out() will
+         * have taken care of things.
+         */
         if (cpu < 0)
                 return;
 
+        /*
+         * Yay, we hit home and are in the context of the event.
+         */
         if (cpu == smp_processor_id()) {
-                WRITE_ONCE(event->pending_disable, -1);
-
-                if (event->attr.sigtrap) {
+                if (event->pending_sigtrap) {
+                        event->pending_sigtrap = 0;
                         perf_sigtrap(event);
-                        atomic_set_release(&event->event_limit, 1); /* rearm event */
-                        return;
+                        local_dec(&event->ctx->nr_pending);
+                }
+                if (event->pending_disable) {
+                        event->pending_disable = 0;
+                        perf_event_disable_local(event);
                 }
-
-                perf_event_disable_local(event);
                 return;
         }
 
@@ -6484,35 +6523,62 @@ static void perf_pending_event_disable(struct perf_event *event)
          *                                irq_work_queue(); // FAILS
          *
          *  irq_work_run()
-         *    perf_pending_event()
+         *    perf_pending_irq()
          *
          * But the event runs on CPU-B and wants disabling there.
          */
-        irq_work_queue_on(&event->pending, cpu);
+        irq_work_queue_on(&event->pending_irq, cpu);
 }
 
-static void perf_pending_event(struct irq_work *entry)
+static void perf_pending_irq(struct irq_work *entry)
 {
-        struct perf_event *event = container_of(entry, struct perf_event, pending);
+        struct perf_event *event = container_of(entry, struct perf_event, pending_irq);
         int rctx;
 
-        rctx = perf_swevent_get_recursion_context();
         /*
          * If we 'fail' here, that's OK, it means recursion is already disabled
          * and we won't recurse 'further'.
          */
+        rctx = perf_swevent_get_recursion_context();
 
-        perf_pending_event_disable(event);
-
+        /*
+         * The wakeup isn't bound to the context of the event -- it can happen
+         * irrespective of where the event is.
+         */
         if (event->pending_wakeup) {
                 event->pending_wakeup = 0;
                 perf_event_wakeup(event);
         }
 
+        __perf_pending_irq(event);
+
         if (rctx >= 0)
                 perf_swevent_put_recursion_context(rctx);
 }
 
+static void perf_pending_task(struct callback_head *head)
+{
+        struct perf_event *event = container_of(head, struct perf_event, pending_task);
+        int rctx;
+
+        /*
+         * If we 'fail' here, that's OK, it means recursion is already disabled
+         * and we won't recurse 'further'.
+         */
+        preempt_disable_notrace();
+        rctx = perf_swevent_get_recursion_context();
+
+        if (event->pending_work) {
+                event->pending_work = 0;
+                perf_sigtrap(event);
+                local_dec(&event->ctx->nr_pending);
+        }
+
+        if (rctx >= 0)
+                perf_swevent_put_recursion_context(rctx);
+        preempt_enable_notrace();
+}
+
 #ifdef CONFIG_GUEST_PERF_EVENTS
 struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
 
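
The race diagram above explains why __perf_pending_irq() may find itself on the wrong CPU; the cure is to chase the event. That redirection idiom, reduced to its core (illustrative; the function name is mine):

    #include <linux/irq_work.h>
    #include <linux/smp.h>

    static void chase(struct irq_work *work, int target_cpu)
    {
            if (target_cpu == smp_processor_id())
                    return;         /* already home; handle the work here */

            /* Re-queue on the CPU that owns the event; fires an IPI there. */
            irq_work_queue_on(work, target_cpu);
    }
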
@@ -9212,8 +9278,8 @@ int perf_event_account_interrupt(struct perf_event *event)
  */
 
 static int __perf_event_overflow(struct perf_event *event,
-                                 int throttle, struct perf_sample_data *data,
-                                 struct pt_regs *regs)
+                                int throttle, struct perf_sample_data *data,
+                                struct pt_regs *regs)
 {
         int events = atomic_read(&event->event_limit);
         int ret = 0;
@@ -9236,24 +9302,36 @@ static int __perf_event_overflow(struct perf_event *event,
         if (events && atomic_dec_and_test(&event->event_limit)) {
                 ret = 1;
                 event->pending_kill = POLL_HUP;
-                event->pending_addr = data->addr;
-
                 perf_event_disable_inatomic(event);
         }
 
+        if (event->attr.sigtrap) {
+                /*
+                 * Should not be able to return to user space without processing
+                 * pending_sigtrap (kernel events can overflow multiple times).
+                 */
+                WARN_ON_ONCE(event->pending_sigtrap && event->attr.exclude_kernel);
+                if (!event->pending_sigtrap) {
+                        event->pending_sigtrap = 1;
+                        local_inc(&event->ctx->nr_pending);
+                }
+                event->pending_addr = data->addr;
+                irq_work_queue(&event->pending_irq);
+        }
+
         READ_ONCE(event->overflow_handler)(event, data, regs);
 
         if (*perf_event_fasync(event) && event->pending_kill) {
                 event->pending_wakeup = 1;
-                irq_work_queue(&event->pending);
+                irq_work_queue(&event->pending_irq);
         }
 
         return ret;
 }
 
 int perf_event_overflow(struct perf_event *event,
-                        struct perf_sample_data *data,
-                        struct pt_regs *regs)
+                       struct perf_sample_data *data,
+                       struct pt_regs *regs)
 {
         return __perf_event_overflow(event, 1, data, regs);
 }
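
The WARN_ON_ONCE() encodes the new invariant: a task cannot take a second user-space overflow before the first pending_sigtrap is delivered, while kernel-only events legitimately can overflow again. The flag/counter pairing used for that accounting boils down to this pattern (hypothetical types, for illustration):

    #include <asm/local.h>

    struct my_ctx {
            local_t nr_pending;
    };

    struct my_event {
            unsigned int pending:1;
            struct my_ctx *ctx;
    };

    static void request_sigtrap(struct my_event *e)
    {
            /* The flag dedups per-event requests; the per-context counter
             * lets the scheduler test "anything pending?" in one read. */
            if (!e->pending) {
                    e->pending = 1;
                    local_inc(&e->ctx->nr_pending);
            }
    }
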
@@ -11570,8 +11648,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
 
         init_waitqueue_head(&event->waitq);
-        event->pending_disable = -1;
-        init_irq_work(&event->pending, perf_pending_event);
+        init_irq_work(&event->pending_irq, perf_pending_irq);
+        init_task_work(&event->pending_task, perf_pending_task);
 
         mutex_init(&event->mmap_mutex);
         raw_spin_lock_init(&event->addr_filters.lock);
@@ -11593,9 +11671,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
         if (parent_event)
                 event->event_caps = parent_event->event_caps;
 
-        if (event->attr.sigtrap)
-                atomic_set(&event->event_limit, 1);
-
         if (task) {
                 event->attach_state = PERF_ATTACH_TASK;
                 /*