Skip to content

Commit b215557

Browse files
Mikulas Patocka authored and snitm (Mike Snitzer) committed
dm snapshot: rework COW throttling to fix deadlock
Commit 721b1d9 ("dm snapshot: Fix excessive memory usage and workqueue stalls") introduced a semaphore to limit the maximum number of in-flight kcopyd (COW) jobs. The implementation of this throttling mechanism is prone to a deadlock: 1. One or more threads write to the origin device causing COW, which is performed by kcopyd. 2. At some point some of these threads might reach the s->cow_count semaphore limit and block in down(&s->cow_count), holding a read lock on _origins_lock. 3. Someone tries to acquire a write lock on _origins_lock, e.g., snapshot_ctr(), which blocks because the threads at step (2) already hold a read lock on it. 4. A COW operation completes and kcopyd runs dm-snapshot's completion callback, which ends up calling pending_complete(). pending_complete() tries to resubmit any deferred origin bios. This requires acquiring a read lock on _origins_lock, which blocks. This happens because the read-write semaphore implementation gives priority to writers, meaning that as soon as a writer tries to enter the critical section, no readers will be allowed in, until all writers have completed their work. So, pending_complete() waits for the writer at step (3) to acquire and release the lock. This writer waits for the readers at step (2) to release the read lock and those readers wait for pending_complete() (the kcopyd thread) to signal the s->cow_count semaphore: DEADLOCK. The above was thoroughly analyzed and documented by Nikos Tsironis as part of his initial proposal for fixing this deadlock, see: https://www.redhat.com/archives/dm-devel/2019-October/msg00001.html Fix this deadlock by reworking COW throttling so that it waits without holding any locks. Add a variable 'in_progress' that counts how many kcopyd jobs are running. A function wait_for_in_progress() will sleep if 'in_progress' is over the limit. It drops _origins_lock in order to avoid the deadlock. 
Reported-by: Guruswamy Basavaiah <[email protected]> Reported-by: Nikos Tsironis <[email protected]> Reviewed-by: Nikos Tsironis <[email protected]> Tested-by: Nikos Tsironis <[email protected]> Fixes: 721b1d9 ("dm snapshot: Fix excessive memory usage and workqueue stalls") Cc: [email protected] # v5.0+ Depends-on: 4a3f111 ("dm snapshot: introduce account_start_copy() and account_end_copy()") Signed-off-by: Mikulas Patocka <[email protected]> Signed-off-by: Mike Snitzer <[email protected]>
1 parent a2f83e8 commit b215557

File tree

1 file changed

+64
-14
lines changed

1 file changed

+64
-14
lines changed

drivers/md/dm-snap.c

Lines changed: 64 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,6 @@
1818
#include <linux/vmalloc.h>
1919
#include <linux/log2.h>
2020
#include <linux/dm-kcopyd.h>
21-
#include <linux/semaphore.h>
2221

2322
#include "dm.h"
2423

@@ -107,8 +106,8 @@ struct dm_snapshot {
107106
/* The on disk metadata handler */
108107
struct dm_exception_store *store;
109108

110-
/* Maximum number of in-flight COW jobs. */
111-
struct semaphore cow_count;
109+
unsigned in_progress;
110+
struct wait_queue_head in_progress_wait;
112111

113112
struct dm_kcopyd_client *kcopyd_client;
114113

@@ -162,8 +161,8 @@ struct dm_snapshot {
162161
*/
163162
#define DEFAULT_COW_THRESHOLD 2048
164163

165-
static int cow_threshold = DEFAULT_COW_THRESHOLD;
166-
module_param_named(snapshot_cow_threshold, cow_threshold, int, 0644);
164+
static unsigned cow_threshold = DEFAULT_COW_THRESHOLD;
165+
module_param_named(snapshot_cow_threshold, cow_threshold, uint, 0644);
167166
MODULE_PARM_DESC(snapshot_cow_threshold, "Maximum number of chunks being copied on write");
168167

169168
DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
@@ -1327,7 +1326,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
13271326
goto bad_hash_tables;
13281327
}
13291328

1330-
sema_init(&s->cow_count, (cow_threshold > 0) ? cow_threshold : INT_MAX);
1329+
init_waitqueue_head(&s->in_progress_wait);
13311330

13321331
s->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
13331332
if (IS_ERR(s->kcopyd_client)) {
@@ -1509,17 +1508,54 @@ static void snapshot_dtr(struct dm_target *ti)
15091508

15101509
dm_put_device(ti, s->origin);
15111510

1511+
WARN_ON(s->in_progress);
1512+
15121513
kfree(s);
15131514
}
15141515

15151516
static void account_start_copy(struct dm_snapshot *s)
15161517
{
1517-
down(&s->cow_count);
1518+
spin_lock(&s->in_progress_wait.lock);
1519+
s->in_progress++;
1520+
spin_unlock(&s->in_progress_wait.lock);
15181521
}
15191522

15201523
static void account_end_copy(struct dm_snapshot *s)
15211524
{
1522-
up(&s->cow_count);
1525+
spin_lock(&s->in_progress_wait.lock);
1526+
BUG_ON(!s->in_progress);
1527+
s->in_progress--;
1528+
if (likely(s->in_progress <= cow_threshold) &&
1529+
unlikely(waitqueue_active(&s->in_progress_wait)))
1530+
wake_up_locked(&s->in_progress_wait);
1531+
spin_unlock(&s->in_progress_wait.lock);
1532+
}
1533+
1534+
static bool wait_for_in_progress(struct dm_snapshot *s, bool unlock_origins)
1535+
{
1536+
if (unlikely(s->in_progress > cow_threshold)) {
1537+
spin_lock(&s->in_progress_wait.lock);
1538+
if (likely(s->in_progress > cow_threshold)) {
1539+
/*
1540+
* NOTE: this throttle doesn't account for whether
1541+
* the caller is servicing an IO that will trigger a COW
1542+
* so excess throttling may result for chunks not required
1543+
* to be COW'd. But if cow_threshold was reached, extra
1544+
* throttling is unlikely to negatively impact performance.
1545+
*/
1546+
DECLARE_WAITQUEUE(wait, current);
1547+
__add_wait_queue(&s->in_progress_wait, &wait);
1548+
__set_current_state(TASK_UNINTERRUPTIBLE);
1549+
spin_unlock(&s->in_progress_wait.lock);
1550+
if (unlock_origins)
1551+
up_read(&_origins_lock);
1552+
io_schedule();
1553+
remove_wait_queue(&s->in_progress_wait, &wait);
1554+
return false;
1555+
}
1556+
spin_unlock(&s->in_progress_wait.lock);
1557+
}
1558+
return true;
15231559
}
15241560

15251561
/*
@@ -1537,7 +1573,7 @@ static void flush_bios(struct bio *bio)
15371573
}
15381574
}
15391575

1540-
static int do_origin(struct dm_dev *origin, struct bio *bio);
1576+
static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit);
15411577

15421578
/*
15431579
* Flush a list of buffers.
@@ -1550,7 +1586,7 @@ static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio)
15501586
while (bio) {
15511587
n = bio->bi_next;
15521588
bio->bi_next = NULL;
1553-
r = do_origin(s->origin, bio);
1589+
r = do_origin(s->origin, bio, false);
15541590
if (r == DM_MAPIO_REMAPPED)
15551591
generic_make_request(bio);
15561592
bio = n;
@@ -1926,6 +1962,11 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
19261962
if (!s->valid)
19271963
return DM_MAPIO_KILL;
19281964

1965+
if (bio_data_dir(bio) == WRITE) {
1966+
while (unlikely(!wait_for_in_progress(s, false)))
1967+
; /* wait_for_in_progress() has slept */
1968+
}
1969+
19291970
down_read(&s->lock);
19301971
dm_exception_table_lock(&lock);
19311972

@@ -2122,7 +2163,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
21222163

21232164
if (bio_data_dir(bio) == WRITE) {
21242165
up_write(&s->lock);
2125-
return do_origin(s->origin, bio);
2166+
return do_origin(s->origin, bio, false);
21262167
}
21272168

21282169
out_unlock:
@@ -2497,15 +2538,24 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
24972538
/*
24982539
* Called on a write from the origin driver.
24992540
*/
2500-
static int do_origin(struct dm_dev *origin, struct bio *bio)
2541+
static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit)
25012542
{
25022543
struct origin *o;
25032544
int r = DM_MAPIO_REMAPPED;
25042545

2546+
again:
25052547
down_read(&_origins_lock);
25062548
o = __lookup_origin(origin->bdev);
2507-
if (o)
2549+
if (o) {
2550+
if (limit) {
2551+
struct dm_snapshot *s;
2552+
list_for_each_entry(s, &o->snapshots, list)
2553+
if (unlikely(!wait_for_in_progress(s, true)))
2554+
goto again;
2555+
}
2556+
25082557
r = __origin_write(&o->snapshots, bio->bi_iter.bi_sector, bio);
2558+
}
25092559
up_read(&_origins_lock);
25102560

25112561
return r;
@@ -2618,7 +2668,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio)
26182668
dm_accept_partial_bio(bio, available_sectors);
26192669

26202670
/* Only tell snapshots if this is a write */
2621-
return do_origin(o->dev, bio);
2671+
return do_origin(o->dev, bio, true);
26222672
}
26232673

26242674
/*

0 commit comments

Comments
 (0)