Skip to content

Commit 378f32b

Browse files
matthewbobrowski authored and tytso committed
ext4: introduce direct I/O write using iomap infrastructure
This patch introduces a new direct I/O write path which makes use of the iomap infrastructure. All direct I/O writes are now passed from the ->write_iter() callback through to the new direct I/O handler ext4_dio_write_iter(). This function is responsible for calling into the iomap infrastructure via iomap_dio_rw(). Code snippets from the existing direct I/O write code within ext4_file_write_iter(), such as checking whether the I/O request is unaligned asynchronous I/O, or whether the write will result in an overwrite, have effectively been moved out and into the new direct I/O ->write_iter() handler. The block mapping flags that are eventually passed down to ext4_map_blocks() from the *_get_block_*() suite of routines have been taken out and introduced within ext4_iomap_alloc(). For inode extension cases, ext4_handle_inode_extension() is effectively the function responsible for performing such metadata updates. This is called after iomap_dio_rw() has returned so that we can safely determine whether we need to potentially truncate any allocated blocks that may have been prepared for this direct I/O write. We don't perform the inode extension, or truncate operations, from the ->end_io() handler as we don't have the original I/O 'length' available there. The ->end_io() handler, however, is responsible for converting allocated unwritten extents to written extents. In the instance of a short write, we fall back and complete the remainder of the I/O using buffered I/O via ext4_buffered_write_iter(). The existing buffer_head direct I/O implementation has been removed as it's now redundant. 
[ Fix up ext4_dio_write_iter() per Jan's comments at https://lore.kernel.org/r/[email protected] -- TYT ] Signed-off-by: Matthew Bobrowski <[email protected]> Reviewed-by: Jan Kara <[email protected]> Reviewed-by: Ritesh Harjani <[email protected]> Link: https://lore.kernel.org/r/e55db6f12ae6ff017f36774135e79f3e7b0333da.1572949325.git.mbobrowski@mbobrowski.org Signed-off-by: Theodore Ts'o <[email protected]>
1 parent 3eaf9cc commit 378f32b

File tree

4 files changed

+218
-455
lines changed

4 files changed

+218
-455
lines changed

fs/ext4/ext4.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1584,7 +1584,6 @@ enum {
15841584
EXT4_STATE_NO_EXPAND, /* No space for expansion */
15851585
EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */
15861586
EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
1587-
EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
15881587
EXT4_STATE_NEWENTRY, /* File just added to dir */
15891588
EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
15901589
EXT4_STATE_EXT_PRECACHED, /* extents have been precached */
@@ -2565,8 +2564,6 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
25652564
struct buffer_head *bh_result, int create);
25662565
int ext4_get_block(struct inode *inode, sector_t iblock,
25672566
struct buffer_head *bh_result, int create);
2568-
int ext4_dio_get_block(struct inode *inode, sector_t iblock,
2569-
struct buffer_head *bh_result, int create);
25702567
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
25712568
struct buffer_head *bh, int create);
25722569
int ext4_walk_page_buffers(handle_t *handle,

fs/ext4/extents.c

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1753,16 +1753,9 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
17531753
*/
17541754
if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
17551755
return 0;
1756-
/*
1757-
* The check for IO to unwritten extent is somewhat racy as we
1758-
* increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after
1759-
* dropping i_data_sem. But reserved blocks should save us in that
1760-
* case.
1761-
*/
1756+
17621757
if (ext4_ext_is_unwritten(ex1) &&
1763-
(ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
1764-
atomic_read(&EXT4_I(inode)->i_unwritten) ||
1765-
(ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)))
1758+
ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)
17661759
return 0;
17671760
#ifdef AGGRESSIVE_TEST
17681761
if (ext1_ee_len >= 4)

fs/ext4/file.c

Lines changed: 174 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include <linux/pagevec.h>
3030
#include <linux/uio.h>
3131
#include <linux/mman.h>
32+
#include <linux/backing-dev.h>
3233
#include "ext4.h"
3334
#include "ext4_jbd2.h"
3435
#include "xattr.h"
@@ -155,13 +156,6 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
155156
return 0;
156157
}
157158

158-
static void ext4_unwritten_wait(struct inode *inode)
159-
{
160-
wait_queue_head_t *wq = ext4_ioend_wq(inode);
161-
162-
wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
163-
}
164-
165159
/*
166160
* This tests whether the IO in question is block-aligned or not.
167161
* Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
@@ -214,13 +208,13 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
214208
struct inode *inode = file_inode(iocb->ki_filp);
215209
ssize_t ret;
216210

211+
if (unlikely(IS_IMMUTABLE(inode)))
212+
return -EPERM;
213+
217214
ret = generic_write_checks(iocb, from);
218215
if (ret <= 0)
219216
return ret;
220217

221-
if (unlikely(IS_IMMUTABLE(inode)))
222-
return -EPERM;
223-
224218
/*
225219
* If we have encountered a bitmap-format file, the size limit
226220
* is smaller than s_maxbytes, which is for extent-mapped files.
@@ -232,9 +226,42 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
232226
return -EFBIG;
233227
iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
234228
}
229+
230+
ret = file_modified(iocb->ki_filp);
231+
if (ret)
232+
return ret;
233+
235234
return iov_iter_count(from);
236235
}
237236

237+
static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
238+
struct iov_iter *from)
239+
{
240+
ssize_t ret;
241+
struct inode *inode = file_inode(iocb->ki_filp);
242+
243+
if (iocb->ki_flags & IOCB_NOWAIT)
244+
return -EOPNOTSUPP;
245+
246+
inode_lock(inode);
247+
ret = ext4_write_checks(iocb, from);
248+
if (ret <= 0)
249+
goto out;
250+
251+
current->backing_dev_info = inode_to_bdi(inode);
252+
ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
253+
current->backing_dev_info = NULL;
254+
255+
out:
256+
inode_unlock(inode);
257+
if (likely(ret > 0)) {
258+
iocb->ki_pos += ret;
259+
ret = generic_write_sync(iocb, ret);
260+
}
261+
262+
return ret;
263+
}
264+
238265
static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
239266
ssize_t written, size_t count)
240267
{
@@ -316,6 +343,139 @@ static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
316343
return written;
317344
}
318345

346+
static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
347+
int error, unsigned int flags)
348+
{
349+
loff_t offset = iocb->ki_pos;
350+
struct inode *inode = file_inode(iocb->ki_filp);
351+
352+
if (error)
353+
return error;
354+
355+
if (size && flags & IOMAP_DIO_UNWRITTEN)
356+
return ext4_convert_unwritten_extents(NULL, inode,
357+
offset, size);
358+
359+
return 0;
360+
}
361+
362+
static const struct iomap_dio_ops ext4_dio_write_ops = {
363+
.end_io = ext4_dio_write_end_io,
364+
};
365+
366+
static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
367+
{
368+
ssize_t ret;
369+
size_t count;
370+
loff_t offset;
371+
handle_t *handle;
372+
struct inode *inode = file_inode(iocb->ki_filp);
373+
bool extend = false, overwrite = false, unaligned_aio = false;
374+
375+
if (iocb->ki_flags & IOCB_NOWAIT) {
376+
if (!inode_trylock(inode))
377+
return -EAGAIN;
378+
} else {
379+
inode_lock(inode);
380+
}
381+
382+
if (!ext4_dio_supported(inode)) {
383+
inode_unlock(inode);
384+
/*
385+
* Fallback to buffered I/O if the inode does not support
386+
* direct I/O.
387+
*/
388+
return ext4_buffered_write_iter(iocb, from);
389+
}
390+
391+
ret = ext4_write_checks(iocb, from);
392+
if (ret <= 0) {
393+
inode_unlock(inode);
394+
return ret;
395+
}
396+
397+
/*
398+
* Unaligned asynchronous direct I/O must be serialized among each
399+
* other as the zeroing of partial blocks of two competing unaligned
400+
* asynchronous direct I/O writes can result in data corruption.
401+
*/
402+
offset = iocb->ki_pos;
403+
count = iov_iter_count(from);
404+
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
405+
!is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, offset)) {
406+
unaligned_aio = true;
407+
inode_dio_wait(inode);
408+
}
409+
410+
/*
411+
* Determine whether the I/O will overwrite allocated and initialized
412+
* blocks. If so, check to see whether it is possible to take the
413+
* dioread_nolock path.
414+
*/
415+
if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) &&
416+
ext4_should_dioread_nolock(inode)) {
417+
overwrite = true;
418+
downgrade_write(&inode->i_rwsem);
419+
}
420+
421+
if (offset + count > EXT4_I(inode)->i_disksize) {
422+
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
423+
if (IS_ERR(handle)) {
424+
ret = PTR_ERR(handle);
425+
goto out;
426+
}
427+
428+
ret = ext4_orphan_add(handle, inode);
429+
if (ret) {
430+
ext4_journal_stop(handle);
431+
goto out;
432+
}
433+
434+
extend = true;
435+
ext4_journal_stop(handle);
436+
}
437+
438+
ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, &ext4_dio_write_ops,
439+
is_sync_kiocb(iocb) || unaligned_aio || extend);
440+
441+
if (extend)
442+
ret = ext4_handle_inode_extension(inode, offset, ret, count);
443+
444+
out:
445+
if (overwrite)
446+
inode_unlock_shared(inode);
447+
else
448+
inode_unlock(inode);
449+
450+
if (ret >= 0 && iov_iter_count(from)) {
451+
ssize_t err;
452+
loff_t endbyte;
453+
454+
offset = iocb->ki_pos;
455+
err = ext4_buffered_write_iter(iocb, from);
456+
if (err < 0)
457+
return err;
458+
459+
/*
460+
* We need to ensure that the pages within the page cache for
461+
* the range covered by this I/O are written to disk and
462+
* invalidated. This is in attempt to preserve the expected
463+
* direct I/O semantics in the case we fallback to buffered I/O
464+
* to complete off the I/O request.
465+
*/
466+
ret += err;
467+
endbyte = offset + err - 1;
468+
err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping,
469+
offset, endbyte);
470+
if (!err)
471+
invalidate_mapping_pages(iocb->ki_filp->f_mapping,
472+
offset >> PAGE_SHIFT,
473+
endbyte >> PAGE_SHIFT);
474+
}
475+
476+
return ret;
477+
}
478+
319479
#ifdef CONFIG_FS_DAX
320480
static ssize_t
321481
ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
@@ -332,15 +492,10 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
332492
return -EAGAIN;
333493
inode_lock(inode);
334494
}
495+
335496
ret = ext4_write_checks(iocb, from);
336497
if (ret <= 0)
337498
goto out;
338-
ret = file_remove_privs(iocb->ki_filp);
339-
if (ret)
340-
goto out;
341-
ret = file_update_time(iocb->ki_filp);
342-
if (ret)
343-
goto out;
344499

345500
offset = iocb->ki_pos;
346501
count = iov_iter_count(from);
@@ -378,10 +533,6 @@ static ssize_t
378533
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
379534
{
380535
struct inode *inode = file_inode(iocb->ki_filp);
381-
int o_direct = iocb->ki_flags & IOCB_DIRECT;
382-
int unaligned_aio = 0;
383-
int overwrite = 0;
384-
ssize_t ret;
385536

386537
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
387538
return -EIO;
@@ -390,59 +541,10 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
390541
if (IS_DAX(inode))
391542
return ext4_dax_write_iter(iocb, from);
392543
#endif
544+
if (iocb->ki_flags & IOCB_DIRECT)
545+
return ext4_dio_write_iter(iocb, from);
393546

394-
if (!inode_trylock(inode)) {
395-
if (iocb->ki_flags & IOCB_NOWAIT)
396-
return -EAGAIN;
397-
inode_lock(inode);
398-
}
399-
400-
ret = ext4_write_checks(iocb, from);
401-
if (ret <= 0)
402-
goto out;
403-
404-
/*
405-
* Unaligned direct AIO must be serialized among each other as zeroing
406-
* of partial blocks of two competing unaligned AIOs can result in data
407-
* corruption.
408-
*/
409-
if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
410-
!is_sync_kiocb(iocb) &&
411-
ext4_unaligned_aio(inode, from, iocb->ki_pos)) {
412-
unaligned_aio = 1;
413-
ext4_unwritten_wait(inode);
414-
}
415-
416-
iocb->private = &overwrite;
417-
/* Check whether we do a DIO overwrite or not */
418-
if (o_direct && !unaligned_aio) {
419-
if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
420-
if (ext4_should_dioread_nolock(inode))
421-
overwrite = 1;
422-
} else if (iocb->ki_flags & IOCB_NOWAIT) {
423-
ret = -EAGAIN;
424-
goto out;
425-
}
426-
}
427-
428-
ret = __generic_file_write_iter(iocb, from);
429-
/*
430-
* Unaligned direct AIO must be the only IO in flight. Otherwise
431-
* overlapping aligned IO after unaligned might result in data
432-
* corruption.
433-
*/
434-
if (ret == -EIOCBQUEUED && unaligned_aio)
435-
ext4_unwritten_wait(inode);
436-
inode_unlock(inode);
437-
438-
if (ret > 0)
439-
ret = generic_write_sync(iocb, ret);
440-
441-
return ret;
442-
443-
out:
444-
inode_unlock(inode);
445-
return ret;
547+
return ext4_buffered_write_iter(iocb, from);
446548
}
447549

448550
#ifdef CONFIG_FS_DAX

0 commit comments

Comments
 (0)