@@ -1084,7 +1084,10 @@ struct bpf_async_cb {
 	struct bpf_prog *prog;
 	void __rcu *callback_fn;
 	void *value;
-	struct rcu_head rcu;
+	union {
+		struct rcu_head rcu;
+		struct work_struct delete_work;
+	};
 	u64 flags;
 };
 
@@ -1107,6 +1110,7 @@ struct bpf_async_cb {
 struct bpf_hrtimer {
 	struct bpf_async_cb cb;
 	struct hrtimer timer;
+	atomic_t cancelling;
 };
 
 struct bpf_work {
@@ -1219,6 +1223,21 @@ static void bpf_wq_delete_work(struct work_struct *work)
 	kfree_rcu(w, cb.rcu);
 }
 
+static void bpf_timer_delete_work(struct work_struct *work)
+{
+	struct bpf_hrtimer *t = container_of(work, struct bpf_hrtimer, cb.delete_work);
+
+	/* Cancel the timer and wait for callback to complete if it was running.
+	 * If hrtimer_cancel() can be safely called it's safe to call
+	 * kfree_rcu(t) right after for both preallocated and non-preallocated
+	 * maps. The async->cb = NULL was already done and no code path can see
+	 * address 't' anymore. Timer if armed for existing bpf_hrtimer before
+	 * bpf_timer_cancel_and_free will have been cancelled.
+	 */
+	hrtimer_cancel(&t->timer);
+	kfree_rcu(t, cb.rcu);
+}
+
 static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
 			    enum bpf_async_type type)
 {
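For readers less familiar with the container_of() idiom used by bpf_timer_delete_work() above, here is a minimal userspace sketch in plain C (not kernel code); struct fake_work, struct fake_hrtimer, and fake_delete_work are made-up names for illustration, and the direct function call stands in for queue_work().

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

/* Same trick as the kernel macro: recover the outer object from a pointer to
 * one of its members.
 */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct fake_work {
	void (*func)(struct fake_work *w);
};

struct fake_hrtimer {
	int id;
	struct fake_work delete_work;	/* embedded, like cb.delete_work above */
};

static void fake_delete_work(struct fake_work *w)
{
	struct fake_hrtimer *t = container_of(w, struct fake_hrtimer, delete_work);

	/* In the kernel this is where hrtimer_cancel() and kfree_rcu() run. */
	printf("deleting timer %d\n", t->id);
	free(t);
}

int main(void)
{
	struct fake_hrtimer *t = malloc(sizeof(*t));

	t->id = 1;
	t->delete_work.func = fake_delete_work;
	/* Stand-in for queue_work(): invoke the worker directly. */
	t->delete_work.func(&t->delete_work);
	return 0;
}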
@@ -1262,6 +1281,8 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
 		clockid = flags & (MAX_CLOCKS - 1);
 		t = (struct bpf_hrtimer *)cb;
 
+		atomic_set(&t->cancelling, 0);
+		INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work);
 		hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
 		t->timer.function = bpf_timer_cb;
 		cb->value = (void *)async - map->record->timer_off;
@@ -1440,7 +1461,8 @@ static void drop_prog_refcnt(struct bpf_async_cb *async)
 
 BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
 {
-	struct bpf_hrtimer *t;
+	struct bpf_hrtimer *t, *cur_t;
+	bool inc = false;
 	int ret = 0;
 
 	if (in_nmi())
@@ -1452,21 +1474,50 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
 		ret = -EINVAL;
 		goto out;
 	}
-	if (this_cpu_read(hrtimer_running) == t) {
+
+	cur_t = this_cpu_read(hrtimer_running);
+	if (cur_t == t) {
 		/* If bpf callback_fn is trying to bpf_timer_cancel()
 		 * its own timer the hrtimer_cancel() will deadlock
-		 * since it waits for callback_fn to finish
+		 * since it waits for callback_fn to finish.
 		 */
 		ret = -EDEADLK;
 		goto out;
 	}
+
+	/* Only account in-flight cancellations when invoked from a timer
+	 * callback, since we want to avoid waiting only if other _callbacks_
+	 * are waiting on us, to avoid introducing lockups. Non-callback paths
+	 * are ok, since nobody would synchronously wait for their completion.
+	 */
+	if (!cur_t)
+		goto drop;
+	atomic_inc(&t->cancelling);
+	/* Need full barrier after relaxed atomic_inc */
+	smp_mb__after_atomic();
+	inc = true;
+	if (atomic_read(&cur_t->cancelling)) {
+		/* We're cancelling timer t, while some other timer callback is
+		 * attempting to cancel us. In such a case, it might be possible
+		 * that timer t belongs to the other callback, or some other
+		 * callback waiting upon it (creating transitive dependencies
+		 * upon us), and we will enter a deadlock if we continue
+		 * cancelling and waiting for it synchronously, since it might
+		 * do the same. Bail!
+		 */
+		ret = -EDEADLK;
+		goto out;
+	}
+drop:
 	drop_prog_refcnt(&t->cb);
 out:
 	__bpf_spin_unlock_irqrestore(&timer->lock);
 	/* Cancel the timer and wait for associated callback to finish
 	 * if it was running.
 	 */
 	ret = ret ?: hrtimer_cancel(&t->timer);
+	if (inc)
+		atomic_dec(&t->cancelling);
 	rcu_read_unlock();
 	return ret;
 }
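The cancelling counter added in this hunk implements a simple "advertise before you wait" protocol: a canceller running inside a timer callback first marks the target timer as being cancelled, then checks whether anyone is cancelling the timer it is running in, and bails with -EDEADLK instead of waiting. Below is a hedged userspace sketch of that decision using C11 atomics; fake_timer and try_cancel are hypothetical names, and the seq_cst atomics stand in for the kernel's atomic_inc()/smp_mb__after_atomic() pairing.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_timer {
	atomic_int cancelling;	/* number of in-flight cancellations of this timer */
};

/* cur: the timer whose callback we are currently running in (NULL if none).
 * t:   the timer we want to cancel and wait for.
 * Returns true if it is safe to wait synchronously, false to bail (-EDEADLK).
 */
static bool try_cancel(struct fake_timer *cur, struct fake_timer *t)
{
	if (!cur)
		return true;	/* not inside a callback: nobody waits on us */

	/* Advertise that t is being cancelled before looking at cur; the
	 * seq_cst RMW plays the role of atomic_inc() + smp_mb__after_atomic().
	 */
	atomic_fetch_add_explicit(&t->cancelling, 1, memory_order_seq_cst);
	if (atomic_load_explicit(&cur->cancelling, memory_order_seq_cst)) {
		/* Someone is cancelling the callback we are running in, so
		 * waiting for t could deadlock (directly or transitively).
		 */
		atomic_fetch_sub_explicit(&t->cancelling, 1, memory_order_seq_cst);
		return false;
	}
	return true;	/* the real code decrements t->cancelling after the wait */
}

int main(void)
{
	struct fake_timer t1 = { 0 }, t2 = { 0 };

	/* t1's callback cancels t2 while t2's callback cancels t1: at least
	 * one side must observe the other's counter and give up.
	 */
	printf("t1_cb cancelling t2: %s\n", try_cancel(&t1, &t2) ? "wait" : "bail");
	printf("t2_cb cancelling t1: %s\n", try_cancel(&t2, &t1) ? "wait" : "bail");
	return 0;
}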
@@ -1512,25 +1563,39 @@ void bpf_timer_cancel_and_free(void *val)
 
 	if (!t)
 		return;
-	/* Cancel the timer and wait for callback to complete if it was running.
-	 * If hrtimer_cancel() can be safely called it's safe to call kfree(t)
-	 * right after for both preallocated and non-preallocated maps.
-	 * The async->cb = NULL was already done and no code path can
-	 * see address 't' anymore.
-	 *
-	 * Check that bpf_map_delete/update_elem() wasn't called from timer
-	 * callback_fn. In such case don't call hrtimer_cancel() (since it will
-	 * deadlock) and don't call hrtimer_try_to_cancel() (since it will just
-	 * return -1). Though callback_fn is still running on this cpu it's
+	/* We check that bpf_map_delete/update_elem() was called from timer
+	 * callback_fn. In such case we don't call hrtimer_cancel() (since it
+	 * will deadlock) and don't call hrtimer_try_to_cancel() (since it will
+	 * just return -1). Though callback_fn is still running on this cpu it's
 	 * safe to do kfree(t) because bpf_timer_cb() read everything it needed
 	 * from 't'. The bpf subprog callback_fn won't be able to access 't',
 	 * since async->cb = NULL was already done. The timer will be
 	 * effectively cancelled because bpf_timer_cb() will return
 	 * HRTIMER_NORESTART.
+	 *
+	 * However, it is possible the timer callback_fn calling us armed the
+	 * timer _before_ calling us, such that failing to cancel it here will
+	 * cause it to possibly use struct hrtimer after freeing bpf_hrtimer.
+	 * Therefore, we _need_ to cancel any outstanding timers before we do
+	 * kfree_rcu, even though no more timers can be armed.
+	 *
+	 * Moreover, we need to schedule work even if timer does not belong to
+	 * the calling callback_fn, as on two different CPUs, we can end up in a
+	 * situation where both sides run in parallel, try to cancel one
+	 * another, and we end up waiting on both sides in hrtimer_cancel
+	 * without making forward progress, since timer1 depends on timer2
+	 * callback to finish, and vice versa.
+	 *
+	 *   CPU 1 (timer1_cb)                   CPU 2 (timer2_cb)
+	 *   bpf_timer_cancel_and_free(timer2)   bpf_timer_cancel_and_free(timer1)
+	 *
+	 * To avoid these issues, punt to workqueue context when we are in a
+	 * timer callback.
 	 */
-	if (this_cpu_read(hrtimer_running) != t)
-		hrtimer_cancel(&t->timer);
-	kfree_rcu(t, cb.rcu);
+	if (this_cpu_read(hrtimer_running))
+		queue_work(system_unbound_wq, &t->cb.delete_work);
+	else
+		bpf_timer_delete_work(&t->cb.delete_work);
 }
 
 /* This function is called by map_delete/update_elem for individual element and
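The new tail of bpf_timer_cancel_and_free() boils down to one decision: if we are running inside a timer callback, hand the cancel-and-free work to another execution context instead of waiting in place. The userspace sketch below models that choice with a thread-local flag and a detached pthread; in_callback, fake_timer, and cancel_and_free are invented stand-ins for hrtimer_running, bpf_hrtimer, and the real kernel paths (queue_work() on system_unbound_wq vs. calling bpf_timer_delete_work() directly).

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static _Thread_local int in_callback;	/* stand-in for this_cpu_read(hrtimer_running) */

struct fake_timer {
	int id;
};

static void delete_timer(struct fake_timer *t)
{
	/* Kernel equivalent: hrtimer_cancel() followed by kfree_rcu(). */
	printf("freeing timer %d\n", t->id);
	free(t);
}

static void *deferred_delete(void *arg)
{
	delete_timer(arg);
	return NULL;
}

static void cancel_and_free(struct fake_timer *t)
{
	if (in_callback) {
		/* We cannot wait in place for a callback we may be part of;
		 * hand the timer to another context, like queue_work() does.
		 */
		pthread_t tid;

		if (pthread_create(&tid, NULL, deferred_delete, t) == 0)
			pthread_detach(tid);
	} else {
		delete_timer(t);	/* safe to cancel and free inline */
	}
}

int main(void)
{
	struct fake_timer *t = malloc(sizeof(*t));

	t->id = 2;
	in_callback = 1;	/* pretend we are inside a timer callback */
	cancel_and_free(t);
	pthread_exit(NULL);	/* let the detached worker finish before exiting */
}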