20
20
* Private flags for iomap_dio, must not overlap with the public ones in
21
21
* iomap.h:
22
22
*/
23
- #define IOMAP_DIO_WRITE_FUA (1 << 28)
24
- #define IOMAP_DIO_NEED_SYNC (1 << 29)
25
- #define IOMAP_DIO_WRITE (1 << 30)
26
- #define IOMAP_DIO_DIRTY (1 << 31)
23
+ #define IOMAP_DIO_CALLER_COMP (1U << 26)
24
+ #define IOMAP_DIO_INLINE_COMP (1U << 27)
25
+ #define IOMAP_DIO_WRITE_THROUGH (1U << 28)
26
+ #define IOMAP_DIO_NEED_SYNC (1U << 29)
27
+ #define IOMAP_DIO_WRITE (1U << 30)
28
+ #define IOMAP_DIO_DIRTY (1U << 31)
27
29
28
30
struct iomap_dio {
29
31
struct kiocb * iocb ;
@@ -41,7 +43,6 @@ struct iomap_dio {
41
43
struct {
42
44
struct iov_iter * iter ;
43
45
struct task_struct * waiter ;
44
- struct bio * poll_bio ;
45
46
} submit ;
46
47
47
48
/* used for aio completion: */
@@ -63,12 +64,14 @@ static struct bio *iomap_dio_alloc_bio(const struct iomap_iter *iter,
63
64
static void iomap_dio_submit_bio (const struct iomap_iter * iter ,
64
65
struct iomap_dio * dio , struct bio * bio , loff_t pos )
65
66
{
67
+ struct kiocb * iocb = dio -> iocb ;
68
+
66
69
atomic_inc (& dio -> ref );
67
70
68
71
/* Sync dio can't be polled reliably */
69
- if ((dio -> iocb -> ki_flags & IOCB_HIPRI ) && !is_sync_kiocb (dio -> iocb )) {
70
- bio_set_polled (bio , dio -> iocb );
71
- dio -> submit . poll_bio = bio ;
72
+ if ((iocb -> ki_flags & IOCB_HIPRI ) && !is_sync_kiocb (iocb )) {
73
+ bio_set_polled (bio , iocb );
74
+ WRITE_ONCE ( iocb -> private , bio ) ;
72
75
}
73
76
74
77
if (dio -> dops && dio -> dops -> submit_io )
@@ -130,6 +133,11 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
130
133
}
131
134
EXPORT_SYMBOL_GPL (iomap_dio_complete );
132
135
136
+ static ssize_t iomap_dio_deferred_complete (void * data )
137
+ {
138
+ return iomap_dio_complete (data );
139
+ }
140
+
133
141
static void iomap_dio_complete_work (struct work_struct * work )
134
142
{
135
143
struct iomap_dio * dio = container_of (work , struct iomap_dio , aio .work );
@@ -152,27 +160,69 @@ void iomap_dio_bio_end_io(struct bio *bio)
152
160
{
153
161
struct iomap_dio * dio = bio -> bi_private ;
154
162
bool should_dirty = (dio -> flags & IOMAP_DIO_DIRTY );
163
+ struct kiocb * iocb = dio -> iocb ;
155
164
156
165
if (bio -> bi_status )
157
166
iomap_dio_set_error (dio , blk_status_to_errno (bio -> bi_status ));
167
+ if (!atomic_dec_and_test (& dio -> ref ))
168
+ goto release_bio ;
158
169
159
- if (atomic_dec_and_test (& dio -> ref )) {
160
- if (dio -> wait_for_completion ) {
161
- struct task_struct * waiter = dio -> submit .waiter ;
162
- WRITE_ONCE (dio -> submit .waiter , NULL );
163
- blk_wake_io_task (waiter );
164
- } else if (dio -> flags & IOMAP_DIO_WRITE ) {
165
- struct inode * inode = file_inode (dio -> iocb -> ki_filp );
166
-
167
- WRITE_ONCE (dio -> iocb -> private , NULL );
168
- INIT_WORK (& dio -> aio .work , iomap_dio_complete_work );
169
- queue_work (inode -> i_sb -> s_dio_done_wq , & dio -> aio .work );
170
- } else {
171
- WRITE_ONCE (dio -> iocb -> private , NULL );
172
- iomap_dio_complete_work (& dio -> aio .work );
173
- }
170
+ /*
171
+ * Synchronous dio, task itself will handle any completion work
172
+ * that is needed after IO. All we need to do is wake the task.
173
+ */
174
+ if (dio -> wait_for_completion ) {
175
+ struct task_struct * waiter = dio -> submit .waiter ;
176
+
177
+ WRITE_ONCE (dio -> submit .waiter , NULL );
178
+ blk_wake_io_task (waiter );
179
+ goto release_bio ;
180
+ }
181
+
182
+ /*
183
+ * Flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline
184
+ */
185
+ if (dio -> flags & IOMAP_DIO_INLINE_COMP ) {
186
+ WRITE_ONCE (iocb -> private , NULL );
187
+ iomap_dio_complete_work (& dio -> aio .work );
188
+ goto release_bio ;
189
+ }
190
+
191
+ /*
192
+ * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule
193
+ * our completion that way to avoid an async punt to a workqueue.
194
+ */
195
+ if (dio -> flags & IOMAP_DIO_CALLER_COMP ) {
196
+ /* only polled IO cares about private being cleared */
197
+ iocb -> private = dio ;
198
+ iocb -> dio_complete = iomap_dio_deferred_complete ;
199
+
200
+ /*
201
+ * Invoke ->ki_complete() directly. We've assigned our
202
+ * dio_complete callback handler, and since the issuer set
203
+ * IOCB_DIO_CALLER_COMP, we know their ki_complete handler will
204
+ * notice ->dio_complete being set and will defer calling that
205
+ * handler until it can be done from a safe task context.
206
+ *
207
+ * Note that the 'res' being passed in here is not important
208
+ * for this case. The actual completion value of the request
209
+ * will be gotten from dio_complete when that is run by the
210
+ * issuer.
211
+ */
212
+ iocb -> ki_complete (iocb , 0 );
213
+ goto release_bio ;
174
214
}
175
215
216
+ /*
217
+ * Async DIO completion that requires filesystem level completion work
218
+ * gets punted to a work queue to complete as the operation may require
219
+ * more IO to be issued to finalise filesystem metadata changes or
220
+ * guarantee data integrity.
221
+ */
222
+ INIT_WORK (& dio -> aio .work , iomap_dio_complete_work );
223
+ queue_work (file_inode (iocb -> ki_filp )-> i_sb -> s_dio_done_wq ,
224
+ & dio -> aio .work );
225
+ release_bio :
176
226
if (should_dirty ) {
177
227
bio_check_pages_dirty (bio );
178
228
} else {
@@ -203,7 +253,7 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
203
253
/*
204
254
* Figure out the bio's operation flags from the dio request, the
205
255
* mapping, and whether or not we want FUA. Note that we can end up
206
- * clearing the WRITE_FUA flag in the dio request.
256
+ * clearing the WRITE_THROUGH flag in the dio request.
207
257
*/
208
258
static inline blk_opf_t iomap_dio_bio_opflags (struct iomap_dio * dio ,
209
259
const struct iomap * iomap , bool use_fua )
@@ -217,7 +267,7 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
217
267
if (use_fua )
218
268
opflags |= REQ_FUA ;
219
269
else
220
- dio -> flags &= ~IOMAP_DIO_WRITE_FUA ;
270
+ dio -> flags &= ~IOMAP_DIO_WRITE_THROUGH ;
221
271
222
272
return opflags ;
223
273
}
@@ -257,12 +307,19 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
257
307
* Use a FUA write if we need datasync semantics, this is a pure
258
308
* data IO that doesn't require any metadata updates (including
259
309
* after IO completion such as unwritten extent conversion) and
260
- * the underlying device supports FUA. This allows us to avoid
261
- * cache flushes on IO completion.
310
+ * the underlying device either supports FUA or doesn't have
311
+ * a volatile write cache. This allows us to avoid cache flushes
312
+ * on IO completion. If we can't use writethrough and need to
313
+ * sync, disable in-task completions as dio completion will
314
+ * need to call generic_write_sync() which will do a blocking
315
+ * fsync / cache flush call.
262
316
*/
263
317
if (!(iomap -> flags & (IOMAP_F_SHARED |IOMAP_F_DIRTY )) &&
264
- (dio -> flags & IOMAP_DIO_WRITE_FUA ) && bdev_fua (iomap -> bdev ))
318
+ (dio -> flags & IOMAP_DIO_WRITE_THROUGH ) &&
319
+ (bdev_fua (iomap -> bdev ) || !bdev_write_cache (iomap -> bdev )))
265
320
use_fua = true;
321
+ else if (dio -> flags & IOMAP_DIO_NEED_SYNC )
322
+ dio -> flags &= ~IOMAP_DIO_CALLER_COMP ;
266
323
}
267
324
268
325
/*
@@ -277,10 +334,23 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
277
334
goto out ;
278
335
279
336
/*
280
- * We can only poll for single bio I/Os.
337
+ * We can only do deferred completion for pure overwrites that
338
+ * don't require additional IO at completion. This rules out
339
+ * writes that need zeroing or extent conversion, extend
340
+ * the file size, or issue journal IO or cache flushes
341
+ * during completion processing.
281
342
*/
282
343
if (need_zeroout ||
344
+ ((dio -> flags & IOMAP_DIO_NEED_SYNC ) && !use_fua ) ||
283
345
((dio -> flags & IOMAP_DIO_WRITE ) && pos >= i_size_read (inode )))
346
+ dio -> flags &= ~IOMAP_DIO_CALLER_COMP ;
347
+
348
+ /*
349
+ * The rules for polled IO completions follow the same guidelines as the
350
+ * ones we set for inline and deferred completions. If none of those
351
+ * are available for this IO, clear the polled flag.
352
+ */
353
+ if (!(dio -> flags & (IOMAP_DIO_INLINE_COMP |IOMAP_DIO_CALLER_COMP )))
284
354
dio -> iocb -> ki_flags &= ~IOCB_HIPRI ;
285
355
286
356
if (need_zeroout ) {
@@ -505,12 +575,14 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
505
575
506
576
dio -> submit .iter = iter ;
507
577
dio -> submit .waiter = current ;
508
- dio -> submit .poll_bio = NULL ;
509
578
510
579
if (iocb -> ki_flags & IOCB_NOWAIT )
511
580
iomi .flags |= IOMAP_NOWAIT ;
512
581
513
582
if (iov_iter_rw (iter ) == READ ) {
583
+ /* reads can always complete inline */
584
+ dio -> flags |= IOMAP_DIO_INLINE_COMP ;
585
+
514
586
if (iomi .pos >= dio -> i_size )
515
587
goto out_free_dio ;
516
588
@@ -524,6 +596,15 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
524
596
iomi .flags |= IOMAP_WRITE ;
525
597
dio -> flags |= IOMAP_DIO_WRITE ;
526
598
599
+ /*
600
+ * Flag as supporting deferred completions, if the issuer
601
+ * groks it. This can avoid a workqueue punt for writes.
602
+ * We may later clear this flag if we need to do other IO
603
+ * as part of this IO completion.
604
+ */
605
+ if (iocb -> ki_flags & IOCB_DIO_CALLER_COMP )
606
+ dio -> flags |= IOMAP_DIO_CALLER_COMP ;
607
+
527
608
if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY ) {
528
609
ret = - EAGAIN ;
529
610
if (iomi .pos >= dio -> i_size ||
@@ -537,13 +618,16 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
537
618
dio -> flags |= IOMAP_DIO_NEED_SYNC ;
538
619
539
620
/*
540
- * For datasync only writes, we optimistically try
541
- * using FUA for this IO. Any non-FUA write that
542
- * occurs will clear this flag, hence we know before
543
- * completion whether a cache flush is necessary.
621
+ * For datasync only writes, we optimistically try using
622
+ * WRITE_THROUGH for this IO. This flag requires either
623
+ * FUA writes through the device's write cache, or a
624
+ * normal write to a device without a volatile write
625
+ * cache. For the former, any non-FUA write that occurs
626
+ * will clear this flag, hence we know before completion
627
+ * whether a cache flush is necessary.
544
628
*/
545
629
if (!(iocb -> ki_flags & IOCB_SYNC ))
546
- dio -> flags |= IOMAP_DIO_WRITE_FUA ;
630
+ dio -> flags |= IOMAP_DIO_WRITE_THROUGH ;
547
631
}
548
632
549
633
/*
@@ -605,14 +689,13 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
605
689
iomap_dio_set_error (dio , ret );
606
690
607
691
/*
608
- * If all the writes we issued were FUA, we don't need to flush the
609
- * cache on IO completion. Clear the sync flag for this case.
692
+ * If all the writes we issued were already written through to the
693
+ * media, we don't need to flush the cache on IO completion. Clear the
694
+ * sync flag for this case.
610
695
*/
611
- if (dio -> flags & IOMAP_DIO_WRITE_FUA )
696
+ if (dio -> flags & IOMAP_DIO_WRITE_THROUGH )
612
697
dio -> flags &= ~IOMAP_DIO_NEED_SYNC ;
613
698
614
- WRITE_ONCE (iocb -> private , dio -> submit .poll_bio );
615
-
616
699
/*
617
700
* We are about to drop our additional submission reference, which
618
701
* might be the last reference to the dio. There are three different
0 commit comments