 #include <linux/highmem.h>
 #include <linux/pgtable.h>
 #include <linux/buildid.h>
+#include <linux/task_work.h>
 
 #include "internal.h"
 
@@ -2276,11 +2277,26 @@ event_sched_out(struct perf_event *event,
         event->pmu->del(event, 0);
         event->oncpu = -1;
 
-        if (READ_ONCE(event->pending_disable) >= 0) {
-                WRITE_ONCE(event->pending_disable, -1);
+        if (event->pending_disable) {
+                event->pending_disable = 0;
                 perf_cgroup_event_disable(event, ctx);
                 state = PERF_EVENT_STATE_OFF;
         }
+
+        if (event->pending_sigtrap) {
+                bool dec = true;
+
+                event->pending_sigtrap = 0;
+                if (state != PERF_EVENT_STATE_OFF &&
+                    !event->pending_work) {
+                        event->pending_work = 1;
+                        dec = false;
+                        task_work_add(current, &event->pending_task, TWA_RESUME);
+                }
+                if (dec)
+                        local_dec(&event->ctx->nr_pending);
+        }
+
         perf_event_set_state(event, state);
 
         if (!is_software_event(event))
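The event_sched_out() hunk above is the heart of the fix: if a SIGTRAP is still pending when the event is scheduled out, delivery is handed off to task_work so it runs in process context on the task's next return to user space instead of being dropped. As orientation only, here is a minimal, self-contained sketch of that task_work pattern, assuming nothing beyond the generic <linux/task_work.h> API; the names my_deferred, my_cb and my_defer are invented for illustration and are not part of this patch.

```c
/* Hypothetical sketch of the task_work pattern; not part of the patch. */
#include <linux/container_of.h>
#include <linux/printk.h>
#include <linux/sched.h>
#include <linux/task_work.h>

struct my_deferred {
        struct callback_head    work;           /* embedded, like event->pending_task */
        int                     payload;
};

/* Runs in process context, just before the task re-enters user space. */
static void my_cb(struct callback_head *head)
{
        struct my_deferred *d = container_of(head, struct my_deferred, work);

        pr_info("deferred payload %d for pid %d\n", d->payload, current->pid);
}

static void my_defer(struct my_deferred *d)
{
        init_task_work(&d->work, my_cb);

        /* TWA_RESUME: run my_cb() on the next return to user space. */
        if (task_work_add(current, &d->work, TWA_RESUME))
                pr_warn("task is exiting; callback was not queued\n");
}
```

Deferring to return-to-user is what lets the signal still be raised in the context of the task that owns the event even though the event itself is being removed from the PMU on the scheduler path.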
@@ -2432,7 +2448,7 @@ static void __perf_event_disable(struct perf_event *event,
          * hold the top-level event's child_mutex, so any descendant that
          * goes to exit will block in perf_event_exit_event().
          *
-         * When called from perf_pending_event it's OK because event->ctx
+         * When called from perf_pending_irq it's OK because event->ctx
          * is the current context on this CPU and preemption is disabled,
          * hence we can't get into perf_event_task_sched_out for this context.
          */
@@ -2471,9 +2487,8 @@ EXPORT_SYMBOL_GPL(perf_event_disable);
 
 void perf_event_disable_inatomic(struct perf_event *event)
 {
-        WRITE_ONCE(event->pending_disable, smp_processor_id());
-        /* can fail, see perf_pending_event_disable() */
-        irq_work_queue(&event->pending);
+        event->pending_disable = 1;
+        irq_work_queue(&event->pending_irq);
 }
 
 #define MAX_INTERRUPTS (~0ULL)
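perf_event_disable_inatomic() now just sets a flag and kicks the renamed event->pending_irq; the actual disabling moved into perf_pending_irq() further down. For readers unfamiliar with irq_work, here is a minimal sketch of the API as it is used here, with invented names that are not from the patch.

```c
/* Hypothetical irq_work sketch mirroring the pending_irq usage above. */
#include <linux/irq_work.h>
#include <linux/printk.h>
#include <linux/smp.h>

static struct irq_work my_work;

/* Runs in hard-irq context on the CPU that processes the irq_work. */
static void my_work_fn(struct irq_work *work)
{
        pr_info("irq_work ran on CPU %d\n", smp_processor_id());
}

static void my_queue(void)
{
        init_irq_work(&my_work, my_work_fn);

        /* Safe even from NMI/PMI context; returns false if already queued. */
        irq_work_queue(&my_work);
}
```

The matching irq_work_sync() call in _free_event(), renamed in a later hunk, waits for any in-flight callback before the event is freed.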
@@ -3428,11 +3443,23 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
                 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
                 if (context_equiv(ctx, next_ctx)) {
 
+                        perf_pmu_disable(pmu);
+
+                        /* PMIs are disabled; ctx->nr_pending is stable. */
+                        if (local_read(&ctx->nr_pending) ||
+                            local_read(&next_ctx->nr_pending)) {
+                                /*
+                                 * Must not swap out ctx when there's pending
+                                 * events that rely on the ctx->task relation.
+                                 */
+                                raw_spin_unlock(&next_ctx->lock);
+                                rcu_read_unlock();
+                                goto inside_switch;
+                        }
+
                         WRITE_ONCE(ctx->task, next);
                         WRITE_ONCE(next_ctx->task, task);
 
-                        perf_pmu_disable(pmu);
-
                         if (cpuctx->sched_cb_usage && pmu->sched_task)
                                 pmu->sched_task(ctx, false);
 
@@ -3473,6 +3500,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
         raw_spin_lock(&ctx->lock);
         perf_pmu_disable(pmu);
 
+inside_switch:
         if (cpuctx->sched_cb_usage && pmu->sched_task)
                 pmu->sched_task(ctx, false);
         task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
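The two scheduling hunks above refuse to swap task contexts while ctx->nr_pending is non-zero, because queued deferred work relies on ctx->task still pointing at the task it was queued on; with the PMU disabled the PMI cannot bump the count concurrently, so a plain local_read() suffices. nr_pending is a local_t, a counter that is cheap and interrupt-safe on the local CPU but not meant for cross-CPU updates. A tiny, self-contained sketch of that primitive follows, assuming only <asm/local.h>; the names are invented.

```c
/* Hypothetical local_t sketch; mirrors the ctx->nr_pending bookkeeping. */
#include <asm/local.h>

struct my_ctx {
        local_t         nr_pending;
};

static void my_ctx_init(struct my_ctx *ctx)
{
        local_set(&ctx->nr_pending, 0);
}

static void my_arm(struct my_ctx *ctx)
{
        local_inc(&ctx->nr_pending);            /* e.g. from the overflow/PMI path */
}

static void my_maybe_release(struct my_ctx *ctx)
{
        if (local_read(&ctx->nr_pending))       /* e.g. from the sched-out path */
                local_dec(&ctx->nr_pending);
}
```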
@@ -4939,7 +4967,7 @@ static void perf_addr_filters_splice(struct perf_event *event,
 
 static void _free_event(struct perf_event *event)
 {
-        irq_work_sync(&event->pending);
+        irq_work_sync(&event->pending_irq);
 
         unaccount_event(event);
 
@@ -6439,7 +6467,8 @@ static void perf_sigtrap(struct perf_event *event)
                 return;
 
         /*
-         * perf_pending_event() can race with the task exiting.
+         * Both perf_pending_task() and perf_pending_irq() can race with the
+         * task exiting.
          */
         if (current->flags & PF_EXITING)
                 return;
@@ -6448,23 +6477,33 @@ static void perf_sigtrap(struct perf_event *event)
                       event->attr.type, event->attr.sig_data);
 }
 
-static void perf_pending_event_disable(struct perf_event *event)
+/*
+ * Deliver the pending work in-event-context or follow the context.
+ */
+static void __perf_pending_irq(struct perf_event *event)
 {
-        int cpu = READ_ONCE(event->pending_disable);
+        int cpu = READ_ONCE(event->oncpu);
 
+        /*
+         * If the event isn't running; we done. event_sched_out() will have
+         * taken care of things.
+         */
         if (cpu < 0)
                 return;
 
+        /*
+         * Yay, we hit home and are in the context of the event.
+         */
         if (cpu == smp_processor_id()) {
-                WRITE_ONCE(event->pending_disable, -1);
-
-                if (event->attr.sigtrap) {
+                if (event->pending_sigtrap) {
+                        event->pending_sigtrap = 0;
                         perf_sigtrap(event);
-                        atomic_set_release(&event->event_limit, 1); /* rearm event */
-                        return;
+                        local_dec(&event->ctx->nr_pending);
+                }
+                if (event->pending_disable) {
+                        event->pending_disable = 0;
+                        perf_event_disable_local(event);
                 }
-
-                perf_event_disable_local(event);
                 return;
         }
 
@@ -6484,35 +6523,62 @@ static void perf_pending_event_disable(struct perf_event *event)
          *                                irq_work_queue(); // FAILS
          *
          *  irq_work_run()
-         *    perf_pending_event()
+         *    perf_pending_irq()
          *
          * But the event runs on CPU-B and wants disabling there.
          */
-        irq_work_queue_on(&event->pending, cpu);
+        irq_work_queue_on(&event->pending_irq, cpu);
 }
 
-static void perf_pending_event(struct irq_work *entry)
+static void perf_pending_irq(struct irq_work *entry)
 {
-        struct perf_event *event = container_of(entry, struct perf_event, pending);
+        struct perf_event *event = container_of(entry, struct perf_event, pending_irq);
         int rctx;
 
-        rctx = perf_swevent_get_recursion_context();
         /*
          * If we 'fail' here, that's OK, it means recursion is already disabled
          * and we won't recurse 'further'.
          */
+        rctx = perf_swevent_get_recursion_context();
 
-        perf_pending_event_disable(event);
-
+        /*
+         * The wakeup isn't bound to the context of the event -- it can happen
+         * irrespective of where the event is.
+         */
         if (event->pending_wakeup) {
                 event->pending_wakeup = 0;
                 perf_event_wakeup(event);
         }
 
+        __perf_pending_irq(event);
+
         if (rctx >= 0)
                 perf_swevent_put_recursion_context(rctx);
 }
 
+static void perf_pending_task(struct callback_head *head)
+{
+        struct perf_event *event = container_of(head, struct perf_event, pending_task);
+        int rctx;
+
+        /*
+         * If we 'fail' here, that's OK, it means recursion is already disabled
+         * and we won't recurse 'further'.
+         */
+        preempt_disable_notrace();
+        rctx = perf_swevent_get_recursion_context();
+
+        if (event->pending_work) {
+                event->pending_work = 0;
+                perf_sigtrap(event);
+                local_dec(&event->ctx->nr_pending);
+        }
+
+        if (rctx >= 0)
+                perf_swevent_put_recursion_context(rctx);
+        preempt_enable_notrace();
+}
+
 #ifdef CONFIG_GUEST_PERF_EVENTS
 struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
 
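perf_pending_irq() now handles wakeups, in-context SIGTRAP delivery and disabling, while the new perf_pending_task() delivers a SIGTRAP that had to be deferred to return-to-user. From user space this machinery is driven through the attr.sigtrap interface. The program below is a hypothetical, minimal consumer: it is not from the patch, the software cpu-clock event and sample period are arbitrary choices, and the sigtrap/remove_on_exec/sig_data attr bits plus TRAP_PERF require kernel headers from v5.13 or later.

```c
/* Hypothetical user-space consumer of attr.sigtrap; minimal error handling. */
#include <linux/perf_event.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef TRAP_PERF
#define TRAP_PERF 6                     /* uapi si_code for perf-generated SIGTRAP */
#endif

static volatile sig_atomic_t hits;

static void on_sigtrap(int sig, siginfo_t *info, void *uc)
{
        /* Newer headers also expose attr.sig_data as info->si_perf_data. */
        if (info->si_code == TRAP_PERF)
                hits++;
}

int main(void)
{
        struct sigaction sa = { .sa_sigaction = on_sigtrap, .sa_flags = SA_SIGINFO };
        struct perf_event_attr attr;
        int fd;

        sigaction(SIGTRAP, &sa, NULL);

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_SOFTWARE;
        attr.config = PERF_COUNT_SW_CPU_CLOCK;
        attr.sample_period = 1000000;   /* one sample per ~1ms of task time */
        attr.sigtrap = 1;               /* deliver SIGTRAP on each overflow */
        attr.remove_on_exec = 1;        /* the kernel requires this with sigtrap */
        attr.sig_data = 0x1234;         /* opaque cookie passed to the handler */

        fd = (int)syscall(SYS_perf_event_open, &attr, 0 /* self */, -1, -1, 0);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }

        for (volatile unsigned long i = 0; i < 200000000UL; i++)
                ;                       /* burn CPU so the event overflows */

        printf("received %d perf SIGTRAPs\n", (int)hits);
        close(fd);
        return 0;
}
```

The lost-SIGTRAP window this patch closes is precisely the case where such an event is scheduled out between arming pending_sigtrap in the overflow path and the irq_work running; event_sched_out() now hands that pending signal to task_work instead of silently clearing it.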
@@ -9212,8 +9278,8 @@ int perf_event_account_interrupt(struct perf_event *event)
  */
 
 static int __perf_event_overflow(struct perf_event *event,
-                                   int throttle, struct perf_sample_data *data,
-                                   struct pt_regs *regs)
+                                 int throttle, struct perf_sample_data *data,
+                                 struct pt_regs *regs)
 {
         int events = atomic_read(&event->event_limit);
         int ret = 0;
@@ -9236,24 +9302,36 @@ static int __perf_event_overflow(struct perf_event *event,
         if (events && atomic_dec_and_test(&event->event_limit)) {
                 ret = 1;
                 event->pending_kill = POLL_HUP;
-                event->pending_addr = data->addr;
-
                 perf_event_disable_inatomic(event);
         }
 
+        if (event->attr.sigtrap) {
+                /*
+                 * Should not be able to return to user space without processing
+                 * pending_sigtrap (kernel events can overflow multiple times).
+                 */
+                WARN_ON_ONCE(event->pending_sigtrap && event->attr.exclude_kernel);
+                if (!event->pending_sigtrap) {
+                        event->pending_sigtrap = 1;
+                        local_inc(&event->ctx->nr_pending);
+                }
+                event->pending_addr = data->addr;
+                irq_work_queue(&event->pending_irq);
+        }
+
         READ_ONCE(event->overflow_handler)(event, data, regs);
 
         if (*perf_event_fasync(event) && event->pending_kill) {
                 event->pending_wakeup = 1;
-                irq_work_queue(&event->pending);
+                irq_work_queue(&event->pending_irq);
         }
 
         return ret;
 }
 
 int perf_event_overflow(struct perf_event *event,
-                          struct perf_sample_data *data,
-                          struct pt_regs *regs)
+                        struct perf_sample_data *data,
+                        struct pt_regs *regs)
 {
         return __perf_event_overflow(event, 1, data, regs);
 }
@@ -11570,8 +11648,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
 
         init_waitqueue_head(&event->waitq);
-        event->pending_disable = -1;
-        init_irq_work(&event->pending, perf_pending_event);
+        init_irq_work(&event->pending_irq, perf_pending_irq);
+        init_task_work(&event->pending_task, perf_pending_task);
 
         mutex_init(&event->mmap_mutex);
         raw_spin_lock_init(&event->addr_filters.lock);
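perf_event_alloc() now wires up both deferral mechanisms at allocation time; the matching field additions (pending_sigtrap, pending_work, the renamed pending_irq, pending_task and ctx->nr_pending) live in include/linux/perf_event.h and are not part of this file's diff. Below is a hypothetical, stand-alone sketch of the same wiring with invented names, showing the two callback objects embedded in one structure.

```c
/* Hypothetical sketch of the allocation-time wiring shown above. */
#include <linux/irq_work.h>
#include <linux/slab.h>
#include <linux/task_work.h>

struct my_event {
        struct irq_work         pending_irq;    /* hard-irq deferral */
        struct callback_head    pending_task;   /* return-to-user deferral */
};

static void my_pending_irq(struct irq_work *w)
{
        /* runs in hard-irq context */
}

static void my_pending_task(struct callback_head *h)
{
        /* runs in process context on return to user space */
}

static struct my_event *my_event_alloc(void)
{
        struct my_event *e = kzalloc(sizeof(*e), GFP_KERNEL);

        if (!e)
                return NULL;

        init_irq_work(&e->pending_irq, my_pending_irq);
        init_task_work(&e->pending_task, my_pending_task);
        return e;
}
```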
@@ -11593,9 +11671,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
         if (parent_event)
                 event->event_caps = parent_event->event_caps;
 
-        if (event->attr.sigtrap)
-                atomic_set(&event->event_limit, 1);
-
         if (task) {
                 event->attach_state = PERF_ATTACH_TASK;
                 /*