Skip to content

Commit 0b80f98

Browse files
htejun authored and
axboe committed
iocost: protect iocg->abs_vdebt with iocg->waitq.lock
abs_vdebt is an atomic_64 which tracks how much over budget a given cgroup is and controls the activation of use_delay mechanism. Once a cgroup goes over budget from forced IOs, it has to pay it back with its future budget. The progress guarantee on debt paying comes from the iocg being active - active iocgs are processed by the periodic timer, which ensures that as time passes the debts dissipate and the iocg returns to normal operation. However, both iocg activation and vdebt handling are asynchronous and a sequence like the following may happen. 1. The iocg is in the process of being deactivated by the periodic timer. 2. A bio enters ioc_rqos_throttle(), calls iocg_activate() which returns without anything because it still sees that the iocg is already active. 3. The iocg is deactivated. 4. The bio from #2 is over budget but needs to be forced. It increases abs_vdebt and goes over the threshold and enables use_delay. 5. IO control is enabled for the iocg's subtree and now IOs are attributed to the descendant cgroups and the iocg itself no longer issues IOs. This leaves the iocg with stuck abs_vdebt - it has debt but inactive and no further IOs which can activate it. This can end up unduly punishing all the descendants cgroups. The usual throttling path has the same issue - the iocg must be active while throttled to ensure that future event will wake it up - and solves the problem by synchronizing the throttling path with a spinlock. abs_vdebt handling is another form of overage handling and shares a lot of characteristics including the fact that it isn't in the hottest path. This patch fixes the above and other possible races by strictly synchronizing abs_vdebt and use_delay handling with iocg->waitq.lock. Signed-off-by: Tejun Heo <[email protected]> Reported-by: Vlad Dmitriev <[email protected]> Cc: [email protected] # v5.4+ Fixes: e1518f6 ("blk-iocost: Don't let merges push vtime into the future") Signed-off-by: Jens Axboe <[email protected]>
1 parent 10c70d9 commit 0b80f98

File tree

2 files changed

+77
-47
lines changed

2 files changed

+77
-47
lines changed

block/blk-iocost.c

Lines changed: 71 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -466,7 +466,7 @@ struct ioc_gq {
466466
*/
467467
atomic64_t vtime;
468468
atomic64_t done_vtime;
469-
atomic64_t abs_vdebt;
469+
u64 abs_vdebt;
470470
u64 last_vtime;
471471

472472
/*
@@ -1142,7 +1142,7 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now)
11421142
struct iocg_wake_ctx ctx = { .iocg = iocg };
11431143
u64 margin_ns = (u64)(ioc->period_us *
11441144
WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC;
1145-
u64 abs_vdebt, vdebt, vshortage, expires, oexpires;
1145+
u64 vdebt, vshortage, expires, oexpires;
11461146
s64 vbudget;
11471147
u32 hw_inuse;
11481148

@@ -1152,18 +1152,15 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now)
11521152
vbudget = now->vnow - atomic64_read(&iocg->vtime);
11531153

11541154
/* pay off debt */
1155-
abs_vdebt = atomic64_read(&iocg->abs_vdebt);
1156-
vdebt = abs_cost_to_cost(abs_vdebt, hw_inuse);
1155+
vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
11571156
if (vdebt && vbudget > 0) {
11581157
u64 delta = min_t(u64, vbudget, vdebt);
11591158
u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse),
1160-
abs_vdebt);
1159+
iocg->abs_vdebt);
11611160

11621161
atomic64_add(delta, &iocg->vtime);
11631162
atomic64_add(delta, &iocg->done_vtime);
1164-
atomic64_sub(abs_delta, &iocg->abs_vdebt);
1165-
if (WARN_ON_ONCE(atomic64_read(&iocg->abs_vdebt) < 0))
1166-
atomic64_set(&iocg->abs_vdebt, 0);
1163+
iocg->abs_vdebt -= abs_delta;
11671164
}
11681165

11691166
/*
@@ -1219,12 +1216,18 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost)
12191216
u64 expires, oexpires;
12201217
u32 hw_inuse;
12211218

1219+
lockdep_assert_held(&iocg->waitq.lock);
1220+
12221221
/* debt-adjust vtime */
12231222
current_hweight(iocg, NULL, &hw_inuse);
1224-
vtime += abs_cost_to_cost(atomic64_read(&iocg->abs_vdebt), hw_inuse);
1223+
vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
12251224

1226-
/* clear or maintain depending on the overage */
1227-
if (time_before_eq64(vtime, now->vnow)) {
1225+
/*
1226+
* Clear or maintain depending on the overage. Non-zero vdebt is what
1227+
* guarantees that @iocg is online and future iocg_kick_delay() will
1228+
* clear use_delay. Don't leave it on when there's no vdebt.
1229+
*/
1230+
if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) {
12281231
blkcg_clear_delay(blkg);
12291232
return false;
12301233
}
@@ -1258,9 +1261,12 @@ static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
12581261
{
12591262
struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
12601263
struct ioc_now now;
1264+
unsigned long flags;
12611265

1266+
spin_lock_irqsave(&iocg->waitq.lock, flags);
12621267
ioc_now(iocg->ioc, &now);
12631268
iocg_kick_delay(iocg, &now, 0);
1269+
spin_unlock_irqrestore(&iocg->waitq.lock, flags);
12641270

12651271
return HRTIMER_NORESTART;
12661272
}
@@ -1368,14 +1374,13 @@ static void ioc_timer_fn(struct timer_list *timer)
13681374
* should have woken up in the last period and expire idle iocgs.
13691375
*/
13701376
list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
1371-
if (!waitqueue_active(&iocg->waitq) &&
1372-
!atomic64_read(&iocg->abs_vdebt) && !iocg_is_idle(iocg))
1377+
if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
1378+
!iocg_is_idle(iocg))
13731379
continue;
13741380

13751381
spin_lock(&iocg->waitq.lock);
13761382

1377-
if (waitqueue_active(&iocg->waitq) ||
1378-
atomic64_read(&iocg->abs_vdebt)) {
1383+
if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) {
13791384
/* might be oversleeping vtime / hweight changes, kick */
13801385
iocg_kick_waitq(iocg, &now);
13811386
iocg_kick_delay(iocg, &now, 0);
@@ -1718,28 +1723,49 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
17181723
* tests are racy but the races aren't systemic - we only miss once
17191724
* in a while which is fine.
17201725
*/
1721-
if (!waitqueue_active(&iocg->waitq) &&
1722-
!atomic64_read(&iocg->abs_vdebt) &&
1726+
if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
17231727
time_before_eq64(vtime + cost, now.vnow)) {
17241728
iocg_commit_bio(iocg, bio, cost);
17251729
return;
17261730
}
17271731

17281732
/*
1729-
* We're over budget. If @bio has to be issued regardless,
1730-
* remember the abs_cost instead of advancing vtime.
1731-
* iocg_kick_waitq() will pay off the debt before waking more IOs.
1733+
* We activated above but w/o any synchronization. Deactivation is
1734+
* synchronized with waitq.lock and we won't get deactivated as long
1735+
* as we're waiting or has debt, so we're good if we're activated
1736+
* here. In the unlikely case that we aren't, just issue the IO.
1737+
*/
1738+
spin_lock_irq(&iocg->waitq.lock);
1739+
1740+
if (unlikely(list_empty(&iocg->active_list))) {
1741+
spin_unlock_irq(&iocg->waitq.lock);
1742+
iocg_commit_bio(iocg, bio, cost);
1743+
return;
1744+
}
1745+
1746+
/*
1747+
* We're over budget. If @bio has to be issued regardless, remember
1748+
* the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
1749+
* off the debt before waking more IOs.
1750+
*
17321751
* This way, the debt is continuously paid off each period with the
1733-
* actual budget available to the cgroup. If we just wound vtime,
1734-
* we would incorrectly use the current hw_inuse for the entire
1735-
* amount which, for example, can lead to the cgroup staying
1736-
* blocked for a long time even with substantially raised hw_inuse.
1752+
* actual budget available to the cgroup. If we just wound vtime, we
1753+
* would incorrectly use the current hw_inuse for the entire amount
1754+
* which, for example, can lead to the cgroup staying blocked for a
1755+
* long time even with substantially raised hw_inuse.
1756+
*
1757+
* An iocg with vdebt should stay online so that the timer can keep
1758+
* deducting its vdebt and [de]activate use_delay mechanism
1759+
* accordingly. We don't want to race against the timer trying to
1760+
* clear them and leave @iocg inactive w/ dangling use_delay heavily
1761+
* penalizing the cgroup and its descendants.
17371762
*/
17381763
if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
1739-
atomic64_add(abs_cost, &iocg->abs_vdebt);
1764+
iocg->abs_vdebt += abs_cost;
17401765
if (iocg_kick_delay(iocg, &now, cost))
17411766
blkcg_schedule_throttle(rqos->q,
17421767
(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
1768+
spin_unlock_irq(&iocg->waitq.lock);
17431769
return;
17441770
}
17451771

@@ -1756,20 +1782,6 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
17561782
* All waiters are on iocg->waitq and the wait states are
17571783
* synchronized using waitq.lock.
17581784
*/
1759-
spin_lock_irq(&iocg->waitq.lock);
1760-
1761-
/*
1762-
* We activated above but w/o any synchronization. Deactivation is
1763-
* synchronized with waitq.lock and we won't get deactivated as
1764-
* long as we're waiting, so we're good if we're activated here.
1765-
* In the unlikely case that we are deactivated, just issue the IO.
1766-
*/
1767-
if (unlikely(list_empty(&iocg->active_list))) {
1768-
spin_unlock_irq(&iocg->waitq.lock);
1769-
iocg_commit_bio(iocg, bio, cost);
1770-
return;
1771-
}
1772-
17731785
init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
17741786
wait.wait.private = current;
17751787
wait.bio = bio;
@@ -1801,6 +1813,7 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
18011813
struct ioc_now now;
18021814
u32 hw_inuse;
18031815
u64 abs_cost, cost;
1816+
unsigned long flags;
18041817

18051818
/* bypass if disabled or for root cgroup */
18061819
if (!ioc->enabled || !iocg->level)
@@ -1820,15 +1833,28 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
18201833
iocg->cursor = bio_end;
18211834

18221835
/*
1823-
* Charge if there's enough vtime budget and the existing request
1824-
* has cost assigned. Otherwise, account it as debt. See debt
1825-
* handling in ioc_rqos_throttle() for details.
1836+
* Charge if there's enough vtime budget and the existing request has
1837+
* cost assigned.
18261838
*/
18271839
if (rq->bio && rq->bio->bi_iocost_cost &&
1828-
time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow))
1840+
time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
18291841
iocg_commit_bio(iocg, bio, cost);
1830-
else
1831-
atomic64_add(abs_cost, &iocg->abs_vdebt);
1842+
return;
1843+
}
1844+
1845+
/*
1846+
* Otherwise, account it as debt if @iocg is online, which it should
1847+
* be for the vast majority of cases. See debt handling in
1848+
* ioc_rqos_throttle() for details.
1849+
*/
1850+
spin_lock_irqsave(&iocg->waitq.lock, flags);
1851+
if (likely(!list_empty(&iocg->active_list))) {
1852+
iocg->abs_vdebt += abs_cost;
1853+
iocg_kick_delay(iocg, &now, cost);
1854+
} else {
1855+
iocg_commit_bio(iocg, bio, cost);
1856+
}
1857+
spin_unlock_irqrestore(&iocg->waitq.lock, flags);
18321858
}
18331859

18341860
static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
@@ -1998,7 +2024,6 @@ static void ioc_pd_init(struct blkg_policy_data *pd)
19982024
iocg->ioc = ioc;
19992025
atomic64_set(&iocg->vtime, now.vnow);
20002026
atomic64_set(&iocg->done_vtime, now.vnow);
2001-
atomic64_set(&iocg->abs_vdebt, 0);
20022027
atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
20032028
INIT_LIST_HEAD(&iocg->active_list);
20042029
iocg->hweight_active = HWEIGHT_WHOLE;

tools/cgroup/iocost_monitor.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,12 @@ def __init__(self, iocg):
159159
else:
160160
self.inflight_pct = 0
161161

162-
self.debt_ms = iocg.abs_vdebt.counter.value_() / VTIME_PER_USEC / 1000
162+
# vdebt used to be an atomic64_t and is now u64, support both
163+
try:
164+
self.debt_ms = iocg.abs_vdebt.counter.value_() / VTIME_PER_USEC / 1000
165+
except:
166+
self.debt_ms = iocg.abs_vdebt.value_() / VTIME_PER_USEC / 1000
167+
163168
self.use_delay = blkg.use_delay.counter.value_()
164169
self.delay_ms = blkg.delay_nsec.counter.value_() / 1_000_000
165170

0 commit comments

Comments
 (0)