Skip to content

Commit d7cb6be

Browse files
committed
Revert "md/raid10: improve raid10 discard request"
This reverts commit bcc90d2.

Matthew Ruffell reported data corruption in raid10 due to the changes in discard handling [1]. Revert these changes until we find a proper fix.

[1] https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1907262/

Cc: Matthew Ruffell <[email protected]>
Cc: Xiao Ni <[email protected]>
Signed-off-by: Song Liu <[email protected]>
1 parent 82fe9af commit d7cb6be

File tree

1 file changed

+1
-255
lines changed

1 file changed

+1
-255
lines changed

drivers/md/raid10.c

Lines changed: 1 addition & 255 deletions
Original file line numberDiff line numberDiff line change
@@ -1516,256 +1516,6 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
15161516
raid10_write_request(mddev, bio, r10_bio);
15171517
}
15181518

1519-
static struct bio *raid10_split_bio(struct r10conf *conf,
1520-
struct bio *bio, sector_t sectors, bool want_first)
1521-
{
1522-
struct bio *split;
1523-
1524-
split = bio_split(bio, sectors, GFP_NOIO, &conf->bio_split);
1525-
bio_chain(split, bio);
1526-
allow_barrier(conf);
1527-
if (want_first) {
1528-
submit_bio_noacct(bio);
1529-
bio = split;
1530-
} else
1531-
submit_bio_noacct(split);
1532-
wait_barrier(conf);
1533-
1534-
return bio;
1535-
}
1536-
1537-
static void raid10_end_discard_request(struct bio *bio)
1538-
{
1539-
struct r10bio *r10_bio = bio->bi_private;
1540-
struct r10conf *conf = r10_bio->mddev->private;
1541-
struct md_rdev *rdev = NULL;
1542-
int dev;
1543-
int slot, repl;
1544-
1545-
/*
1546-
* We don't care the return value of discard bio
1547-
*/
1548-
if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
1549-
set_bit(R10BIO_Uptodate, &r10_bio->state);
1550-
1551-
dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
1552-
if (repl)
1553-
rdev = conf->mirrors[dev].replacement;
1554-
if (!rdev) {
1555-
/* raid10_remove_disk uses smp_mb to make sure rdev is set to
1556-
* replacement before setting replacement to NULL. It can read
1557-
* rdev first without barrier protection even when replacement is NULL
1558-
*/
1559-
smp_rmb();
1560-
rdev = conf->mirrors[dev].rdev;
1561-
}
1562-
1563-
if (atomic_dec_and_test(&r10_bio->remaining)) {
1564-
md_write_end(r10_bio->mddev);
1565-
raid_end_bio_io(r10_bio);
1566-
}
1567-
1568-
rdev_dec_pending(rdev, conf->mddev);
1569-
}
1570-
1571-
/* There are some limitations to handle discard bio
1572-
* 1st, the discard size is bigger than stripe_size*2.
1573-
2nd, if the discard bio spans reshape progress, we use the old way to
1574-
* handle discard bio
1575-
*/
1576-
static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
1577-
{
1578-
struct r10conf *conf = mddev->private;
1579-
struct geom *geo = &conf->geo;
1580-
struct r10bio *r10_bio;
1581-
1582-
int disk;
1583-
sector_t chunk;
1584-
unsigned int stripe_size;
1585-
sector_t split_size;
1586-
1587-
sector_t bio_start, bio_end;
1588-
sector_t first_stripe_index, last_stripe_index;
1589-
sector_t start_disk_offset;
1590-
unsigned int start_disk_index;
1591-
sector_t end_disk_offset;
1592-
unsigned int end_disk_index;
1593-
unsigned int remainder;
1594-
1595-
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
1596-
return -EAGAIN;
1597-
1598-
wait_barrier(conf);
1599-
1600-
/* Check reshape again to avoid reshape happens after checking
1601-
* MD_RECOVERY_RESHAPE and before wait_barrier
1602-
*/
1603-
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
1604-
goto out;
1605-
1606-
stripe_size = geo->raid_disks << geo->chunk_shift;
1607-
bio_start = bio->bi_iter.bi_sector;
1608-
bio_end = bio_end_sector(bio);
1609-
1610-
/* Maybe one discard bio is smaller than stripe size or spans one stripe
1611-
* and discard region is larger than one stripe size. For far offset layout,
1612-
if the discard region is not aligned with stripe size, there is a hole
1613-
* when we submit discard bio to member disk. For simplicity, we only
1614-
* handle discard bio which discard region is bigger than stripe_size*2
1615-
*/
1616-
if (bio_sectors(bio) < stripe_size*2)
1617-
goto out;
1618-
1619-
/* For far offset layout, if bio is not aligned with stripe size, it splits
1620-
the part that is not aligned with stripe size.
1621-
*/
1622-
div_u64_rem(bio_start, stripe_size, &remainder);
1623-
if (geo->far_offset && remainder) {
1624-
split_size = stripe_size - remainder;
1625-
bio = raid10_split_bio(conf, bio, split_size, false);
1626-
}
1627-
div_u64_rem(bio_end, stripe_size, &remainder);
1628-
if (geo->far_offset && remainder) {
1629-
split_size = bio_sectors(bio) - remainder;
1630-
bio = raid10_split_bio(conf, bio, split_size, true);
1631-
}
1632-
1633-
r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
1634-
r10_bio->mddev = mddev;
1635-
r10_bio->state = 0;
1636-
r10_bio->sectors = 0;
1637-
memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks);
1638-
1639-
wait_blocked_dev(mddev, r10_bio);
1640-
1641-
r10_bio->master_bio = bio;
1642-
1643-
bio_start = bio->bi_iter.bi_sector;
1644-
bio_end = bio_end_sector(bio);
1645-
1646-
/* raid10 uses chunk as the unit to store data. It's similar like raid0.
1647-
* One stripe contains the chunks from all member disk (one chunk from
1648-
* one disk at the same HBA address). For layout detail, see 'man md 4'
1649-
*/
1650-
chunk = bio_start >> geo->chunk_shift;
1651-
chunk *= geo->near_copies;
1652-
first_stripe_index = chunk;
1653-
start_disk_index = sector_div(first_stripe_index, geo->raid_disks);
1654-
if (geo->far_offset)
1655-
first_stripe_index *= geo->far_copies;
1656-
start_disk_offset = (bio_start & geo->chunk_mask) +
1657-
(first_stripe_index << geo->chunk_shift);
1658-
1659-
chunk = bio_end >> geo->chunk_shift;
1660-
chunk *= geo->near_copies;
1661-
last_stripe_index = chunk;
1662-
end_disk_index = sector_div(last_stripe_index, geo->raid_disks);
1663-
if (geo->far_offset)
1664-
last_stripe_index *= geo->far_copies;
1665-
end_disk_offset = (bio_end & geo->chunk_mask) +
1666-
(last_stripe_index << geo->chunk_shift);
1667-
1668-
rcu_read_lock();
1669-
for (disk = 0; disk < geo->raid_disks; disk++) {
1670-
struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
1671-
struct md_rdev *rrdev = rcu_dereference(
1672-
conf->mirrors[disk].replacement);
1673-
1674-
r10_bio->devs[disk].bio = NULL;
1675-
r10_bio->devs[disk].repl_bio = NULL;
1676-
1677-
if (rdev && (test_bit(Faulty, &rdev->flags)))
1678-
rdev = NULL;
1679-
if (rrdev && (test_bit(Faulty, &rrdev->flags)))
1680-
rrdev = NULL;
1681-
if (!rdev && !rrdev)
1682-
continue;
1683-
1684-
if (rdev) {
1685-
r10_bio->devs[disk].bio = bio;
1686-
atomic_inc(&rdev->nr_pending);
1687-
}
1688-
if (rrdev) {
1689-
r10_bio->devs[disk].repl_bio = bio;
1690-
atomic_inc(&rrdev->nr_pending);
1691-
}
1692-
}
1693-
rcu_read_unlock();
1694-
1695-
atomic_set(&r10_bio->remaining, 1);
1696-
for (disk = 0; disk < geo->raid_disks; disk++) {
1697-
sector_t dev_start, dev_end;
1698-
struct bio *mbio, *rbio = NULL;
1699-
struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
1700-
struct md_rdev *rrdev = rcu_dereference(
1701-
conf->mirrors[disk].replacement);
1702-
1703-
/*
1704-
* Now start to calculate the start and end address for each disk.
1705-
* The space between dev_start and dev_end is the discard region.
1706-
*
1707-
* For dev_start, it needs to consider three conditions:
1708-
* 1st, the disk is before start_disk, you can imagine the disk in
1709-
* the next stripe. So the dev_start is the start address of next
1710-
* stripe.
1711-
* 2nd, the disk is after start_disk, it means the disk is at the
1712-
* same stripe of first disk
1713-
* 3rd, the first disk itself, we can use start_disk_offset directly
1714-
*/
1715-
if (disk < start_disk_index)
1716-
dev_start = (first_stripe_index + 1) * mddev->chunk_sectors;
1717-
else if (disk > start_disk_index)
1718-
dev_start = first_stripe_index * mddev->chunk_sectors;
1719-
else
1720-
dev_start = start_disk_offset;
1721-
1722-
if (disk < end_disk_index)
1723-
dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
1724-
else if (disk > end_disk_index)
1725-
dev_end = last_stripe_index * mddev->chunk_sectors;
1726-
else
1727-
dev_end = end_disk_offset;
1728-
1729-
/* It only handles a discard bio whose size is >= stripe size, so
1730-
* dev_end > dev_start all the time
1731-
*/
1732-
if (r10_bio->devs[disk].bio) {
1733-
mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
1734-
mbio->bi_end_io = raid10_end_discard_request;
1735-
mbio->bi_private = r10_bio;
1736-
r10_bio->devs[disk].bio = mbio;
1737-
r10_bio->devs[disk].devnum = disk;
1738-
atomic_inc(&r10_bio->remaining);
1739-
md_submit_discard_bio(mddev, rdev, mbio,
1740-
dev_start + choose_data_offset(r10_bio, rdev),
1741-
dev_end - dev_start);
1742-
bio_endio(mbio);
1743-
}
1744-
if (r10_bio->devs[disk].repl_bio) {
1745-
rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
1746-
rbio->bi_end_io = raid10_end_discard_request;
1747-
rbio->bi_private = r10_bio;
1748-
r10_bio->devs[disk].repl_bio = rbio;
1749-
r10_bio->devs[disk].devnum = disk;
1750-
atomic_inc(&r10_bio->remaining);
1751-
md_submit_discard_bio(mddev, rrdev, rbio,
1752-
dev_start + choose_data_offset(r10_bio, rrdev),
1753-
dev_end - dev_start);
1754-
bio_endio(rbio);
1755-
}
1756-
}
1757-
1758-
if (atomic_dec_and_test(&r10_bio->remaining)) {
1759-
md_write_end(r10_bio->mddev);
1760-
raid_end_bio_io(r10_bio);
1761-
}
1762-
1763-
return 0;
1764-
out:
1765-
allow_barrier(conf);
1766-
return -EAGAIN;
1767-
}
1768-
17691519
static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
17701520
{
17711521
struct r10conf *conf = mddev->private;
@@ -1780,10 +1530,6 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
17801530
if (!md_write_start(mddev, bio))
17811531
return false;
17821532

1783-
if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
1784-
if (!raid10_handle_discard(mddev, bio))
1785-
return true;
1786-
17871533
/*
17881534
* If this request crosses a chunk boundary, we need to split
17891535
* it.
@@ -4023,7 +3769,7 @@ static int raid10_run(struct mddev *mddev)
40233769

40243770
if (mddev->queue) {
40253771
blk_queue_max_discard_sectors(mddev->queue,
4026-
UINT_MAX);
3772+
mddev->chunk_sectors);
40273773
blk_queue_max_write_same_sectors(mddev->queue, 0);
40283774
blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
40293775
blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);

0 commit comments

Comments
 (0)