Skip to content

Commit 9449ad3

Browse files
biger410 authored and torvalds committed
ocfs2: issue zeroout to EOF blocks
For hole punches in EOF blocks, fallocate used buffered writes to zero the EOF blocks in the last cluster. But since ->writepage ignores EOF pages, those zeros are never flushed. This "looks" OK because commit 6bba447 ("ocfs2: fix data corruption by fallocate") zeroes the EOF blocks when the file size is extended, but it isn't. The problem occurs on those EOF pages: before writeback, the pages had the DIRTY flag set and all buffer_heads in them also had the DIRTY flag set; when writeback was run by write_cache_pages(), the DIRTY flag on the page was cleared, but the DIRTY flag on the buffer_heads was not. When the next write hit those EOF pages, since the buffer_heads already had the DIRTY flag set, the page was not marked DIRTY again. That made writeback ignore them forever, which causes data corruption. Even direct I/O writes don't work, because dropping the page cache before direct I/O fails when it finds that the buffer_heads for those pages still have the DIRTY flag set, and the write then falls back to buffered I/O mode. To summarize the issue: as writeback ignores EOF pages, once any EOF page is generated, any write to it only goes to the page cache; it is never flushed to disk, even after the file size is extended and the page is no longer an EOF page. The fix is to avoid zeroing EOF blocks with buffered writes. The following code snippet from qemu-img could trigger the corruption. 656 open("6b3711ae-3306-4bdd-823c-cf1c0060a095.conv.2", O_RDWR|O_DIRECT|O_CLOEXEC) = 11 ... 
660 fallocate(11, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE, 2275868672, 327680 <unfinished ...> 660 fallocate(11, 0, 2275868672, 327680) = 0 658 pwrite64(11, " Link: https://lkml.kernel.org/r/[email protected] Signed-off-by: Junxiao Bi <[email protected]> Reviewed-by: Joseph Qi <[email protected]> Cc: Mark Fasheh <[email protected]> Cc: Joel Becker <[email protected]> Cc: Changwei Ge <[email protected]> Cc: Gang He <[email protected]> Cc: Jun Piao <[email protected]> Cc: <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent f267aeb commit 9449ad3

File tree

1 file changed

+60
-39
lines changed

1 file changed

+60
-39
lines changed

fs/ocfs2/file.c

Lines changed: 60 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1529,6 +1529,45 @@ static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
15291529
}
15301530
}
15311531

1532+
/*
1533+
* zero out partial blocks of one cluster.
1534+
*
1535+
* start: file offset where zero starts, will be made upper block aligned.
1536+
* len: it will be trimmed to the end of current cluster if "start + len"
1537+
* is bigger than it.
1538+
*/
1539+
static int ocfs2_zeroout_partial_cluster(struct inode *inode,
1540+
u64 start, u64 len)
1541+
{
1542+
int ret;
1543+
u64 start_block, end_block, nr_blocks;
1544+
u64 p_block, offset;
1545+
u32 cluster, p_cluster, nr_clusters;
1546+
struct super_block *sb = inode->i_sb;
1547+
u64 end = ocfs2_align_bytes_to_clusters(sb, start);
1548+
1549+
if (start + len < end)
1550+
end = start + len;
1551+
1552+
start_block = ocfs2_blocks_for_bytes(sb, start);
1553+
end_block = ocfs2_blocks_for_bytes(sb, end);
1554+
nr_blocks = end_block - start_block;
1555+
if (!nr_blocks)
1556+
return 0;
1557+
1558+
cluster = ocfs2_bytes_to_clusters(sb, start);
1559+
ret = ocfs2_get_clusters(inode, cluster, &p_cluster,
1560+
&nr_clusters, NULL);
1561+
if (ret)
1562+
return ret;
1563+
if (!p_cluster)
1564+
return 0;
1565+
1566+
offset = start_block - ocfs2_clusters_to_blocks(sb, cluster);
1567+
p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset;
1568+
return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS);
1569+
}
1570+
15321571
static int ocfs2_zero_partial_clusters(struct inode *inode,
15331572
u64 start, u64 len)
15341573
{
@@ -1538,6 +1577,7 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
15381577
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
15391578
unsigned int csize = osb->s_clustersize;
15401579
handle_t *handle;
1580+
loff_t isize = i_size_read(inode);
15411581

15421582
/*
15431583
* The "start" and "end" values are NOT necessarily part of
@@ -1558,6 +1598,26 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
15581598
if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
15591599
goto out;
15601600

1601+
/* No page cache for EOF blocks, issue zero out to disk. */
1602+
if (end > isize) {
1603+
/*
1604+
* zeroout eof blocks in last cluster starting from
1605+
* "isize" even "start" > "isize" because it is
1606+
* complicated to zeroout just at "start" as "start"
1607+
* may be not aligned with block size, buffer write
1608+
* would be required to do that, but out of eof buffer
1609+
* write is not supported.
1610+
*/
1611+
ret = ocfs2_zeroout_partial_cluster(inode, isize,
1612+
end - isize);
1613+
if (ret) {
1614+
mlog_errno(ret);
1615+
goto out;
1616+
}
1617+
if (start >= isize)
1618+
goto out;
1619+
end = isize;
1620+
}
15611621
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
15621622
if (IS_ERR(handle)) {
15631623
ret = PTR_ERR(handle);
@@ -1855,45 +1915,6 @@ int ocfs2_remove_inode_range(struct inode *inode,
18551915
return ret;
18561916
}
18571917

1858-
/*
1859-
* zero out partial blocks of one cluster.
1860-
*
1861-
* start: file offset where zero starts, will be made upper block aligned.
1862-
* len: it will be trimmed to the end of current cluster if "start + len"
1863-
* is bigger than it.
1864-
*/
1865-
static int ocfs2_zeroout_partial_cluster(struct inode *inode,
1866-
u64 start, u64 len)
1867-
{
1868-
int ret;
1869-
u64 start_block, end_block, nr_blocks;
1870-
u64 p_block, offset;
1871-
u32 cluster, p_cluster, nr_clusters;
1872-
struct super_block *sb = inode->i_sb;
1873-
u64 end = ocfs2_align_bytes_to_clusters(sb, start);
1874-
1875-
if (start + len < end)
1876-
end = start + len;
1877-
1878-
start_block = ocfs2_blocks_for_bytes(sb, start);
1879-
end_block = ocfs2_blocks_for_bytes(sb, end);
1880-
nr_blocks = end_block - start_block;
1881-
if (!nr_blocks)
1882-
return 0;
1883-
1884-
cluster = ocfs2_bytes_to_clusters(sb, start);
1885-
ret = ocfs2_get_clusters(inode, cluster, &p_cluster,
1886-
&nr_clusters, NULL);
1887-
if (ret)
1888-
return ret;
1889-
if (!p_cluster)
1890-
return 0;
1891-
1892-
offset = start_block - ocfs2_clusters_to_blocks(sb, cluster);
1893-
p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset;
1894-
return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS);
1895-
}
1896-
18971918
/*
18981919
* Parts of this function taken from xfs_change_file_space()
18991920
*/

0 commit comments

Comments
 (0)