Skip to content

Commit 9156289

Browse files
jankaratytso
authored andcommitted
ext4: properly sync file size update after O_SYNC direct IO
Gao Xiang has reported that on ext4 O_SYNC direct IO does not properly sync file size update and thus if we crash at unfortunate moment, the file can have smaller size although O_SYNC IO has reported successful completion. The problem happens because update of on-disk inode size is handled in ext4_dio_write_iter() *after* iomap_dio_rw() (and thus dio_complete() in particular) has returned and generic_file_sync() gets called by dio_complete(). Fix the problem by handling on-disk inode size update directly in our ->end_io completion handler. References: https://lore.kernel.org/all/[email protected] Reported-by: Gao Xiang <[email protected]> CC: [email protected] Fixes: 378f32b ("ext4: introduce direct I/O write using iomap infrastructure") Signed-off-by: Jan Kara <[email protected]> Tested-by: Joseph Qi <[email protected]> Reviewed-by: "Ritesh Harjani (IBM)" <[email protected]> Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Theodore Ts'o <[email protected]>
1 parent ce56d21 commit 9156289

File tree

1 file changed

+65
-88
lines changed

1 file changed

+65
-88
lines changed

fs/ext4/file.c

Lines changed: 65 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -306,80 +306,38 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
306306
}
307307

308308
static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
309-
ssize_t written, size_t count)
309+
ssize_t count)
310310
{
311311
handle_t *handle;
312-
bool truncate = false;
313-
u8 blkbits = inode->i_blkbits;
314-
ext4_lblk_t written_blk, end_blk;
315-
int ret;
316-
317-
/*
318-
* Note that EXT4_I(inode)->i_disksize can get extended up to
319-
* inode->i_size while the I/O was running due to writeback of delalloc
320-
* blocks. But, the code in ext4_iomap_alloc() is careful to use
321-
* zeroed/unwritten extents if this is possible; thus we won't leave
322-
* uninitialized blocks in a file even if we didn't succeed in writing
323-
* as much as we intended.
324-
*/
325-
WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize);
326-
if (offset + count <= EXT4_I(inode)->i_disksize) {
327-
/*
328-
* We need to ensure that the inode is removed from the orphan
329-
* list if it has been added prematurely, due to writeback of
330-
* delalloc blocks.
331-
*/
332-
if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
333-
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
334-
335-
if (IS_ERR(handle)) {
336-
ext4_orphan_del(NULL, inode);
337-
return PTR_ERR(handle);
338-
}
339-
340-
ext4_orphan_del(handle, inode);
341-
ext4_journal_stop(handle);
342-
}
343-
344-
return written;
345-
}
346-
347-
if (written < 0)
348-
goto truncate;
349312

313+
lockdep_assert_held_write(&inode->i_rwsem);
350314
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
351-
if (IS_ERR(handle)) {
352-
written = PTR_ERR(handle);
353-
goto truncate;
354-
}
315+
if (IS_ERR(handle))
316+
return PTR_ERR(handle);
355317

356-
if (ext4_update_inode_size(inode, offset + written)) {
357-
ret = ext4_mark_inode_dirty(handle, inode);
318+
if (ext4_update_inode_size(inode, offset + count)) {
319+
int ret = ext4_mark_inode_dirty(handle, inode);
358320
if (unlikely(ret)) {
359-
written = ret;
360321
ext4_journal_stop(handle);
361-
goto truncate;
322+
return ret;
362323
}
363324
}
364325

365-
/*
366-
* We may need to truncate allocated but not written blocks beyond EOF.
367-
*/
368-
written_blk = ALIGN(offset + written, 1 << blkbits);
369-
end_blk = ALIGN(offset + count, 1 << blkbits);
370-
if (written_blk < end_blk && ext4_can_truncate(inode))
371-
truncate = true;
372-
373-
/*
374-
* Remove the inode from the orphan list if it has been extended and
375-
* everything went OK.
376-
*/
377-
if (!truncate && inode->i_nlink)
326+
if (inode->i_nlink)
378327
ext4_orphan_del(handle, inode);
379328
ext4_journal_stop(handle);
380329

381-
if (truncate) {
382-
truncate:
330+
return count;
331+
}
332+
333+
/*
334+
* Clean up the inode after DIO or DAX extending write has completed and the
335+
* inode size has been updated using ext4_handle_inode_extension().
336+
*/
337+
static void ext4_inode_extension_cleanup(struct inode *inode, ssize_t count)
338+
{
339+
lockdep_assert_held_write(&inode->i_rwsem);
340+
if (count < 0) {
383341
ext4_truncate_failed_write(inode);
384342
/*
385343
* If the truncate operation failed early, then the inode may
@@ -388,9 +346,28 @@ static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
388346
*/
389347
if (inode->i_nlink)
390348
ext4_orphan_del(NULL, inode);
349+
return;
391350
}
351+
/*
352+
* If i_disksize got extended due to writeback of delalloc blocks while
353+
* the DIO was running we could fail to cleanup the orphan list in
354+
* ext4_handle_inode_extension(). Do it now.
355+
*/
356+
if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
357+
handle_t *handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
392358

393-
return written;
359+
if (IS_ERR(handle)) {
360+
/*
361+
* The write has successfully completed. Not much to
362+
* do with the error here so just cleanup the orphan
363+
* list and hope for the best.
364+
*/
365+
ext4_orphan_del(NULL, inode);
366+
return;
367+
}
368+
ext4_orphan_del(handle, inode);
369+
ext4_journal_stop(handle);
370+
}
394371
}
395372

396373
static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
@@ -399,31 +376,22 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
399376
loff_t pos = iocb->ki_pos;
400377
struct inode *inode = file_inode(iocb->ki_filp);
401378

379+
if (!error && size && flags & IOMAP_DIO_UNWRITTEN)
380+
error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
402381
if (error)
403382
return error;
404-
405-
if (size && flags & IOMAP_DIO_UNWRITTEN) {
406-
error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
407-
if (error < 0)
408-
return error;
409-
}
410383
/*
411-
* If we are extending the file, we have to update i_size here before
412-
* page cache gets invalidated in iomap_dio_rw(). Otherwise racing
413-
* buffered reads could zero out too much from page cache pages. Update
414-
* of on-disk size will happen later in ext4_dio_write_iter() where
415-
* we have enough information to also perform orphan list handling etc.
416-
* Note that we perform all extending writes synchronously under
417-
* i_rwsem held exclusively so i_size update is safe here in that case.
418-
* If the write was not extending, we cannot see pos > i_size here
419-
* because operations reducing i_size like truncate wait for all
420-
* outstanding DIO before updating i_size.
384+
* Note that EXT4_I(inode)->i_disksize can get extended up to
385+
* inode->i_size while the I/O was running due to writeback of delalloc
386+
* blocks. But the code in ext4_iomap_alloc() is careful to use
387+
* zeroed/unwritten extents if this is possible; thus we won't leave
388+
* uninitialized blocks in a file even if we didn't succeed in writing
389+
* as much as we intended.
421390
*/
422-
pos += size;
423-
if (pos > i_size_read(inode))
424-
i_size_write(inode, pos);
425-
426-
return 0;
391+
WARN_ON_ONCE(i_size_read(inode) < READ_ONCE(EXT4_I(inode)->i_disksize));
392+
if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize))
393+
return size;
394+
return ext4_handle_inode_extension(inode, pos, size);
427395
}
428396

429397
static const struct iomap_dio_ops ext4_dio_write_ops = {
@@ -608,9 +576,16 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
608576
dio_flags, NULL, 0);
609577
if (ret == -ENOTBLK)
610578
ret = 0;
611-
612-
if (extend)
613-
ret = ext4_handle_inode_extension(inode, offset, ret, count);
579+
if (extend) {
580+
/*
581+
* We always perform extending DIO write synchronously so by
582+
* now the IO is completed and ext4_handle_inode_extension()
583+
* was called. Cleanup the inode in case of error or race with
584+
* writeback of delalloc blocks.
585+
*/
586+
WARN_ON_ONCE(ret == -EIOCBQUEUED);
587+
ext4_inode_extension_cleanup(inode, ret);
588+
}
614589

615590
out:
616591
if (ilock_shared)
@@ -691,8 +666,10 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
691666

692667
ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
693668

694-
if (extend)
695-
ret = ext4_handle_inode_extension(inode, offset, ret, count);
669+
if (extend) {
670+
ret = ext4_handle_inode_extension(inode, offset, ret);
671+
ext4_inode_extension_cleanup(inode, ret);
672+
}
696673
out:
697674
inode_unlock(inode);
698675
if (ret > 0)

0 commit comments

Comments
 (0)