Commit 9637d51

Merge tag 'for-linus-20190715' of git://git.kernel.dk/linux-block
Pull more block updates from Jens Axboe:
 "A later pull request with some followup items. I had some vacation
  coming up to the merge window, so certain items were delayed a bit.
  This pull request also contains fixes that came in within the last
  few days of the merge window, which I didn't want to push right
  before sending you a pull request.

  This contains:

   - NVMe pull request, mostly fixes, but also a few minor items on
     the feature side that were timing constrained (Christoph et al)

   - Report zones fixes (Damien)

   - Removal of dead code (Damien)

   - Turn on cgroup psi memstall (Josef)

   - block cgroup MAINTAINERS entry (Konstantin)

   - Flush init fix (Josef)

   - blk-throttle low iops timing fix (Konstantin)

   - nbd resize fixes (Mike)

   - nbd 0 blocksize crash fix (Xiubo)

   - block integrity error leak fix (Wenwen)

   - blk-cgroup writeback and priority inheritance fixes (Tejun)"

* tag 'for-linus-20190715' of git://git.kernel.dk/linux-block: (42 commits)
  MAINTAINERS: add entry for block io cgroup
  null_blk: fixup ->report_zones() for !CONFIG_BLK_DEV_ZONED
  block: Limit zone array allocation size
  sd_zbc: Fix report zones buffer allocation
  block: Kill gfp_t argument of blkdev_report_zones()
  block: Allow mapping of vmalloc-ed buffers
  block/bio-integrity: fix a memory leak bug
  nvme: fix NULL deref for fabrics options
  nbd: add netlink reconfigure resize support
  nbd: fix crash when the blksize is zero
  block: Disable write plugging for zoned block devices
  block: Fix elevator name declaration
  block: Remove unused definitions
  nvme: fix regression upon hot device removal and insertion
  blk-throttle: fix zero wait time for iops throttled group
  block: Fix potential overflow in blk_report_zones()
  blkcg: implement REQ_CGROUP_PUNT
  blkcg, writeback: Implement wbc_blkcg_css()
  blkcg, writeback: Add wbc->no_cgroup_owner
  blkcg, writeback: Rename wbc_account_io() to wbc_account_cgroup_owner()
  ...
2 parents: 273cbf6 + 787c79d

50 files changed: +660 −210 lines

Documentation/admin-guide/cgroup-v2.rst

Lines changed: 1 addition & 1 deletion
@@ -2124,7 +2124,7 @@ following two functions.
 	a queue (device) has been associated with the bio and
 	before submission.

-  wbc_account_io(@wbc, @page, @bytes)
+  wbc_account_cgroup_owner(@wbc, @page, @bytes)
	Should be called for each data segment being written out.
	While this function doesn't care exactly when it's called
	during the writeback session, it's the easiest and most
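As context for the rename above, a minimal sketch of how a filesystem writeback path uses this helper. example_writeback_one_page() is a hypothetical wrapper added here purely for illustration; wbc_init_bio() and wbc_account_cgroup_owner() are the documented calls:

#include <linux/bio.h>
#include <linux/writeback.h>

/* Illustration only, not part of this commit. */
static void example_writeback_one_page(struct writeback_control *wbc,
                                       struct bio *bio, struct page *page)
{
        wbc_init_bio(wbc, bio);                         /* tie the bio to the page owner's cgroup */
        bio_add_page(bio, page, PAGE_SIZE, 0);
        wbc_account_cgroup_owner(wbc, page, PAGE_SIZE); /* was wbc_account_io() before this series */
}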

Documentation/block/biodoc.txt

Lines changed: 0 additions & 5 deletions
@@ -843,11 +843,6 @@ elevator_latter_req_fn	These return the request before or after the

 elevator_completed_req_fn	called when a request is completed.

-elevator_may_queue_fn		returns true if the scheduler wants to allow the
-				current context to queue a new request even if
-				it is over the queue limit. This must be used
-				very carefully!!
-
 elevator_set_req_fn
 elevator_put_req_fn		Must be used to allocate and free any elevator
				specific storage for a request.

MAINTAINERS

Lines changed: 13 additions & 0 deletions
@@ -4183,6 +4183,19 @@ S: Maintained
 F:	mm/memcontrol.c
 F:	mm/swap_cgroup.c

+CONTROL GROUP - BLOCK IO CONTROLLER (BLKIO)
+M:	Tejun Heo <[email protected]>
+M:	Jens Axboe <[email protected]>
+L:	[email protected]
+L:	[email protected]
+T:	git git://git.kernel.dk/linux-block
+F:	Documentation/cgroup-v1/blkio-controller.rst
+F:	block/blk-cgroup.c
+F:	include/linux/blk-cgroup.h
+F:	block/blk-throttle.c
+F:	block/blk-iolatency.c
+F:	block/bfq-cgroup.c
+
 CORETEMP HARDWARE MONITORING DRIVER
 M:	Fenghua Yu <[email protected]>

block/bio-integrity.c

Lines changed: 6 additions & 2 deletions
@@ -276,8 +276,12 @@ bool bio_integrity_prep(struct bio *bio)
		ret = bio_integrity_add_page(bio, virt_to_page(buf),
					     bytes, offset);

-		if (ret == 0)
-			return false;
+		if (ret == 0) {
+			printk(KERN_ERR "could not attach integrity payload\n");
+			kfree(buf);
+			status = BLK_STS_RESOURCE;
+			goto err_end_io;
+		}

		if (ret < bytes)
			break;

block/bio.c

Lines changed: 27 additions & 1 deletion
@@ -16,6 +16,7 @@
 #include <linux/workqueue.h>
 #include <linux/cgroup.h>
 #include <linux/blk-cgroup.h>
+#include <linux/highmem.h>

 #include <trace/events/block.h>
 #include "blk.h"
@@ -1441,8 +1442,22 @@ void bio_unmap_user(struct bio *bio)
	bio_put(bio);
 }

+static void bio_invalidate_vmalloc_pages(struct bio *bio)
+{
+#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
+	if (bio->bi_private && !op_is_write(bio_op(bio))) {
+		unsigned long i, len = 0;
+
+		for (i = 0; i < bio->bi_vcnt; i++)
+			len += bio->bi_io_vec[i].bv_len;
+		invalidate_kernel_vmap_range(bio->bi_private, len);
+	}
+#endif
+}
+
 static void bio_map_kern_endio(struct bio *bio)
 {
+	bio_invalidate_vmalloc_pages(bio);
	bio_put(bio);
 }

@@ -1463,13 +1478,20 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
	unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	unsigned long start = kaddr >> PAGE_SHIFT;
	const int nr_pages = end - start;
+	bool is_vmalloc = is_vmalloc_addr(data);
+	struct page *page;
	int offset, i;
	struct bio *bio;

	bio = bio_kmalloc(gfp_mask, nr_pages);
	if (!bio)
		return ERR_PTR(-ENOMEM);

+	if (is_vmalloc) {
+		flush_kernel_vmap_range(data, len);
+		bio->bi_private = data;
+	}
+
	offset = offset_in_page(kaddr);
	for (i = 0; i < nr_pages; i++) {
		unsigned int bytes = PAGE_SIZE - offset;
@@ -1480,7 +1502,11 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
		if (bytes > len)
			bytes = len;

-		if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
+		if (!is_vmalloc)
+			page = virt_to_page(data);
+		else
+			page = vmalloc_to_page(data);
+		if (bio_add_pc_page(q, bio, page, bytes,
				    offset) < bytes) {
			/* we don't support partial mappings */
			bio_put(bio);
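With this change, bio_map_kern() (usually reached through blk_rq_map_kern()) accepts vmalloc-ed buffers, which the report-zones work in this same pull relies on for larger zone arrays. A minimal sketch of a caller follows; example_map_buffer() is a hypothetical helper, not code from any driver in this series:

#include <linux/bio.h>
#include <linux/err.h>
#include <linux/vmalloc.h>

/* Illustration only: map a vmalloc-ed buffer for a kernel-internal request. */
static struct bio *example_map_buffer(struct request_queue *q, unsigned int len)
{
        void *buf = vmalloc(len);                       /* previously had to come from kmalloc() */
        struct bio *bio;

        if (!buf)
                return ERR_PTR(-ENOMEM);

        bio = bio_map_kern(q, buf, len, GFP_KERNEL);    /* flushes the vmap alias internally */
        if (IS_ERR(bio))
                vfree(buf);
        return bio;
}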

block/blk-cgroup.c

Lines changed: 61 additions & 5 deletions
@@ -29,6 +29,7 @@
 #include <linux/ctype.h>
 #include <linux/blk-cgroup.h>
 #include <linux/tracehook.h>
+#include <linux/psi.h>
 #include "blk.h"

 #define MAX_KEY_LEN 100
@@ -47,12 +48,14 @@ struct blkcg blkcg_root;
 EXPORT_SYMBOL_GPL(blkcg_root);

 struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
+EXPORT_SYMBOL_GPL(blkcg_root_css);

 static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];

 static LIST_HEAD(all_blkcgs);		/* protected by blkcg_pol_mutex */

 static bool blkcg_debug_stats = false;
+static struct workqueue_struct *blkcg_punt_bio_wq;

 static bool blkcg_policy_enabled(struct request_queue *q,
				 const struct blkcg_policy *pol)
@@ -87,6 +90,8 @@ static void __blkg_release(struct rcu_head *rcu)
 {
	struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);

+	WARN_ON(!bio_list_empty(&blkg->async_bios));
+
	/* release the blkcg and parent blkg refs this blkg has been holding */
	css_put(&blkg->blkcg->css);
	if (blkg->parent)
@@ -112,6 +117,23 @@ static void blkg_release(struct percpu_ref *ref)
	call_rcu(&blkg->rcu_head, __blkg_release);
 }

+static void blkg_async_bio_workfn(struct work_struct *work)
+{
+	struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
+					     async_bio_work);
+	struct bio_list bios = BIO_EMPTY_LIST;
+	struct bio *bio;
+
+	/* as long as there are pending bios, @blkg can't go away */
+	spin_lock_bh(&blkg->async_bio_lock);
+	bio_list_merge(&bios, &blkg->async_bios);
+	bio_list_init(&blkg->async_bios);
+	spin_unlock_bh(&blkg->async_bio_lock);
+
+	while ((bio = bio_list_pop(&bios)))
+		submit_bio(bio);
+}
+
 /**
  * blkg_alloc - allocate a blkg
  * @blkcg: block cgroup the new blkg is associated with
@@ -140,6 +162,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,

	blkg->q = q;
	INIT_LIST_HEAD(&blkg->q_node);
+	spin_lock_init(&blkg->async_bio_lock);
+	bio_list_init(&blkg->async_bios);
+	INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
	blkg->blkcg = blkcg;

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
@@ -1526,6 +1551,25 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
 }
 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);

+bool __blkcg_punt_bio_submit(struct bio *bio)
+{
+	struct blkcg_gq *blkg = bio->bi_blkg;
+
+	/* consume the flag first */
+	bio->bi_opf &= ~REQ_CGROUP_PUNT;
+
+	/* never bounce for the root cgroup */
+	if (!blkg->parent)
+		return false;
+
+	spin_lock_bh(&blkg->async_bio_lock);
+	bio_list_add(&blkg->async_bios, bio);
+	spin_unlock_bh(&blkg->async_bio_lock);
+
+	queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
+	return true;
+}
+
 /*
  * Scale the accumulated delay based on how long it has been since we updated
  * the delay. We only call this when we are adding delay, in case it's been a
@@ -1587,6 +1631,7 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
  */
 static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
 {
+	unsigned long pflags;
	u64 now = ktime_to_ns(ktime_get());
	u64 exp;
	u64 delay_nsec = 0;
@@ -1613,11 +1658,8 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
	 */
	delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);

-	/*
-	 * TODO: the use_memdelay flag is going to be for the upcoming psi stuff
-	 * that hasn't landed upstream yet. Once that stuff is in place we need
-	 * to do a psi_memstall_enter/leave if memdelay is set.
-	 */
+	if (use_memdelay)
+		psi_memstall_enter(&pflags);

	exp = ktime_add_ns(now, delay_nsec);
	tok = io_schedule_prepare();
@@ -1627,6 +1669,9 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
		break;
	} while (!fatal_signal_pending(current));
	io_schedule_finish(tok);
+
+	if (use_memdelay)
+		psi_memstall_leave(&pflags);
 }

 /**
@@ -1726,5 +1771,16 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
	atomic64_add(delta, &blkg->delay_nsec);
 }

+static int __init blkcg_init(void)
+{
+	blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
+					    WQ_MEM_RECLAIM | WQ_FREEZABLE |
+					    WQ_UNBOUND | WQ_SYSFS, 0);
+	if (!blkcg_punt_bio_wq)
+		return -ENOMEM;
+	return 0;
+}
+subsys_initcall(blkcg_init);
+
 module_param(blkcg_debug_stats, bool, 0644);
 MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
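submit_bio() (see block/blk-core.c below) only calls into __blkcg_punt_bio_submit() when REQ_CGROUP_PUNT is set on the bio. The inline gate lives in include/linux/blk-cgroup.h and is not part of this excerpt, so the following is an approximate sketch rather than the exact upstream wrapper:

static inline bool blkcg_punt_bio_submit(struct bio *bio)
{
        if (bio->bi_opf & REQ_CGROUP_PUNT)
                return __blkcg_punt_bio_submit(bio);    /* queue on blkg->async_bios */
        return false;                                   /* common case: submit inline */
}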

block/blk-core.c

Lines changed: 5 additions & 1 deletion
@@ -117,6 +117,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
	rq->internal_tag = -1;
	rq->start_time_ns = ktime_get_ns();
	rq->part = NULL;
+	refcount_set(&rq->ref, 1);
 }
 EXPORT_SYMBOL(blk_rq_init);

@@ -687,7 +688,7 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
	struct request *rq;
	struct list_head *plug_list;

-	plug = current->plug;
+	plug = blk_mq_plug(q, bio);
	if (!plug)
		return false;

@@ -1127,6 +1128,9 @@ EXPORT_SYMBOL_GPL(direct_make_request);
  */
 blk_qc_t submit_bio(struct bio *bio)
 {
+	if (blkcg_punt_bio_submit(bio))
+		return BLK_QC_T_NONE;
+
	/*
	 * If it's a regular read/write or a barrier with data attached,
	 * go through the normal accounting stuff before submission.
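On the issuing side, a submitter that must not block on the bio owner's cgroup I/O limits tags the bio before submission. This fragment is illustrative only and is not taken from this diff:

/* Illustration only: punt this bio to the owning blkcg's async worker. */
bio->bi_opf |= REQ_CGROUP_PUNT;
submit_bio(bio);        /* returns BLK_QC_T_NONE immediately when the bio is punted */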

block/blk-mq.c

Lines changed: 1 addition & 1 deletion
@@ -1973,7 +1973,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)

	blk_mq_bio_to_request(rq, bio, nr_segs);

-	plug = current->plug;
+	plug = blk_mq_plug(q, bio);
	if (unlikely(is_flush_fua)) {
		/* bypass scheduler for flush rq */
		blk_insert_flush(rq);

block/blk-mq.h

Lines changed: 32 additions & 0 deletions
@@ -233,4 +233,36 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
		qmap->mq_map[cpu] = 0;
 }

+/*
+ * blk_mq_plug() - Get caller context plug
+ * @q: request queue
+ * @bio : the bio being submitted by the caller context
+ *
+ * Plugging, by design, may delay the insertion of BIOs into the elevator in
+ * order to increase BIO merging opportunities. This however can cause BIO
+ * insertion order to change from the order in which submit_bio() is being
+ * executed in the case of multiple contexts concurrently issuing BIOs to a
+ * device, even if these context are synchronized to tightly control BIO issuing
+ * order. While this is not a problem with regular block devices, this ordering
+ * change can cause write BIO failures with zoned block devices as these
+ * require sequential write patterns to zones. Prevent this from happening by
+ * ignoring the plug state of a BIO issuing context if the target request queue
+ * is for a zoned block device and the BIO to plug is a write operation.
+ *
+ * Return current->plug if the bio can be plugged and NULL otherwise
+ */
+static inline struct blk_plug *blk_mq_plug(struct request_queue *q,
+					   struct bio *bio)
+{
+	/*
+	 * For regular block devices or read operations, use the context plug
+	 * which may be NULL if blk_start_plug() was not executed.
+	 */
+	if (!blk_queue_is_zoned(q) || !op_is_write(bio_op(bio)))
+		return current->plug;
+
+	/* Zoned block device write operation case: do not plug the BIO */
+	return NULL;
+}
+
 #endif
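For context, the plug that blk_mq_plug() consults comes from the standard plugging pattern sketched below (illustrative only; write_bio and read_bio are placeholder bios). With this helper, a write aimed at a zoned queue bypasses the plug even inside the plugged section:

struct blk_plug plug;

blk_start_plug(&plug);          /* sets current->plug */
submit_bio(write_bio);          /* zoned device + write: blk_mq_plug() returns NULL, issued in order */
submit_bio(read_bio);           /* reads still go through current->plug and may be merged */
blk_finish_plug(&plug);         /* flushes whatever was actually plugged */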

block/blk-throttle.c

Lines changed: 3 additions & 6 deletions
@@ -881,13 +881,10 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
	u64 tmp;

-	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
-
-	/* Slice has just started. Consider one slice interval */
-	if (!jiffy_elapsed)
-		jiffy_elapsed_rnd = tg->td->throtl_slice;
+	jiffy_elapsed = jiffies - tg->slice_start[rw];

-	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
+	/* Round up to the next throttle slice, wait time must be nonzero */
+	jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice);

	/*
	 * jiffy_elapsed_rnd should not be a big value as minimum iops can be
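A worked example of the zero-wait case this fixes, using illustrative numbers (assume tg->td->throtl_slice is 100 jiffies and the group is already over its iops limit, so the wait is computed as jiffy_elapsed_rnd - jiffy_elapsed):

/*
 * jiffy_elapsed = 100          (exactly one throtl_slice since slice_start)
 *
 * old: jiffy_elapsed_rnd = roundup(100, 100) = 100
 *      jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed = 0     -> dispatched with no wait
 *
 * new: jiffy_elapsed_rnd = roundup(100 + 1, 100) = 200
 *      jiffy_wait = 200 - 100 = 100                           -> the throttled group waits a full slice
 */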
