Merge tag 'for-5.4/block-2019-09-16' of git://git.kernel.dk/linux-block

Pull block updates from Jens Axboe:

 - Two NVMe pull requests:
     - ana log parse fix from Anton
     - nvme quirks support for Apple devices from Ben
     - fix missing bio completion tracing for multipath stack devices
       from Hannes and Mikhail
     - IP TOS settings for nvme rdma and tcp transports from Israel
     - rq_dma_dir cleanups from Israel
     - tracing for Get LBA Status command from Minwoo
     - Some nvme-tcp cleanups from Minwoo, Potnuri and myself
     - Some consolidation between the fabrics transports for handling
       the CAP register
     - reset race with ns scanning fix for fabrics (move fabrics
       commands to a dedicated request queue with a different lifetime
       from the admin request queue).
     - controller reset and namespace scan races fixes
     - nvme discovery log change uevent support
     - naming improvements from Keith
     - multiple discovery controllers reject fix from James
     - some regular cleanups from various people

 - Series fixing (and re-fixing) null_blk debug printing and nr_devices
   checks (André)

 - A few md pull requests from Song, with fixes from Andy, Guoqing,
   Guilherme, Neil, Nigel, and Yufen.

 - REQ_OP_ZONE_RESET_ALL support (Chaitanya)

 - Bio merge handling unification (Christoph)

 - Pick default elevator correctly for devices with special needs
   (Damien)

 - Block stats fixes (Hou)

 - Timeout and support devices nbd fixes (Mike)

 - Series fixing races around elevator switching and device add/remove
   (Ming)

 - sed-opal cleanups (Revanth)

 - Per device weight support for BFQ (Fam)

 - Support for blk-iocost, a new model that can properly account the cost
   of IO workloads (Tejun)

 - blk-cgroup writeback fixes (Tejun)

 - paride queue init fixes (zhengbin)

 - blk_set_runtime_active() cleanup (Stanley)

 - Block segment mapping optimizations (Bart)

 - lightnvm fixes (Hans/Minwoo/YueHaibing)

 - Various little fixes and cleanups

* tag 'for-5.4/block-2019-09-16' of git://git.kernel.dk/linux-block: (186 commits)
  null_blk: format pr_* logs with pr_fmt
  null_blk: match the type of parameter nr_devices
  null_blk: do not fail the module load with zero devices
  block: also check RQF_STATS in blk_mq_need_time_stamp()
  block: make rq sector size accessible for block stats
  bfq: Fix bfq linkage error
  raid5: use bio_end_sector in r5_next_bio
  raid5: remove STRIPE_OPS_REQ_PENDING
  md: add feature flag MD_FEATURE_RAID0_LAYOUT
  md/raid0: avoid RAID0 data corruption due to layout confusion.
  raid5: don't set STRIPE_HANDLE to stripe which is in batch list
  raid5: don't increment read_errors on EILSEQ return
  nvmet: fix a wrong error status returned in error log page
  nvme: send discovery log page change events to userspace
  nvme: add uevent variables for controller devices
  nvme: enable aen regardless of the presence of I/O queues
  nvme-fabrics: allow discovery subsystems accept a kato
  nvmet: Use PTR_ERR_OR_ZERO() in nvmet_init_discovery()
  nvme: Remove redundant assignment of cq vector
  nvme: Assign subsys instance from first ctrl
  ...
Merged by Linus Torvalds, 2019-09-17 16:57:47 -07:00
107 changed files with 5894 additions and 1282 deletions

@@ -26,6 +26,9 @@ menuconfig BLOCK
if BLOCK
config BLK_RQ_ALLOC_TIME
bool
config BLK_SCSI_REQUEST
bool
@@ -132,6 +135,16 @@ config BLK_CGROUP_IOLATENCY
Note, this is an experimental interface and could be changed someday.
config BLK_CGROUP_IOCOST
bool "Enable support for cost model based cgroup IO controller"
depends on BLK_CGROUP=y
select BLK_RQ_ALLOC_TIME
---help---
Enabling this option enables the .weight interface for cost
model based proportional IO control. The IO controller
distributes IO capacity between different groups based on
their share of the overall weight distribution.
config BLK_WBT_MQ
bool "Multiqueue writeback throttling"
default y
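
The BLK_CGROUP_IOCOST help text above describes the ".weight" interface for cost-model based proportional IO control. The following is a hedged userspace sketch of how such a weight might be configured through the cgroup2 "io.weight" file; the mount point, cgroup name, and device numbers are assumptions for the example, not part of this patch.

/*
 * Illustrative only: set the cost-model IO weight for a cgroup through the
 * cgroup2 "io.weight" file. Paths and device numbers below are assumptions.
 */
#include <stdio.h>

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	if (fputs(val, f) == EOF) {
		fclose(f);
		return -1;
	}
	return fclose(f);
}

int main(void)
{
	const char *w = "/sys/fs/cgroup/websvc/io.weight";	/* assumed cgroup */

	/* Default weight applied to all devices for this group. */
	if (write_str(w, "default 200\n"))
		perror("default weight");

	/* Per-device override, "MAJ:MIN WEIGHT" (8:0 is an assumption). */
	if (write_str(w, "8:0 800\n"))
		perror("device weight");

	return 0;
}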


@@ -18,6 +18,7 @@ obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o
obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o
obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o
obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o
bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o


@@ -501,11 +501,12 @@ static void bfq_cpd_free(struct blkcg_policy_data *cpd)
kfree(cpd_to_bfqgd(cpd));
}
static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, struct request_queue *q,
struct blkcg *blkcg)
{
struct bfq_group *bfqg;
bfqg = kzalloc_node(sizeof(*bfqg), gfp, node);
bfqg = kzalloc_node(sizeof(*bfqg), gfp, q->node);
if (!bfqg)
return NULL;
@@ -904,7 +905,7 @@ void bfq_end_wr_async(struct bfq_data *bfqd)
bfq_end_wr_async_queues(bfqd, bfqd->root_group);
}
static int bfq_io_show_weight(struct seq_file *sf, void *v)
static int bfq_io_show_weight_legacy(struct seq_file *sf, void *v)
{
struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
@@ -918,6 +919,60 @@ static int bfq_io_show_weight(struct seq_file *sf, void *v)
return 0;
}
static u64 bfqg_prfill_weight_device(struct seq_file *sf,
struct blkg_policy_data *pd, int off)
{
struct bfq_group *bfqg = pd_to_bfqg(pd);
if (!bfqg->entity.dev_weight)
return 0;
return __blkg_prfill_u64(sf, pd, bfqg->entity.dev_weight);
}
static int bfq_io_show_weight(struct seq_file *sf, void *v)
{
struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
seq_printf(sf, "default %u\n", bfqgd->weight);
blkcg_print_blkgs(sf, blkcg, bfqg_prfill_weight_device,
&blkcg_policy_bfq, 0, false);
return 0;
}
static void bfq_group_set_weight(struct bfq_group *bfqg, u64 weight, u64 dev_weight)
{
weight = dev_weight ?: weight;
bfqg->entity.dev_weight = dev_weight;
/*
* Setting the prio_changed flag of the entity
* to 1 with new_weight == weight would re-set
* the value of the weight to its ioprio mapping.
* Set the flag only if necessary.
*/
if ((unsigned short)weight != bfqg->entity.new_weight) {
bfqg->entity.new_weight = (unsigned short)weight;
/*
* Make sure that the above new value has been
* stored in bfqg->entity.new_weight before
* setting the prio_changed flag. In fact,
* this flag may be read asynchronously (in
* critical sections protected by a different
* lock than that held here), and finding this
* flag set may cause the execution of the code
* for updating parameters whose value may
* depend also on bfqg->entity.new_weight (in
* __bfq_entity_update_weight_prio).
* This barrier makes sure that the new value
* of bfqg->entity.new_weight is correctly
* seen in that code.
*/
smp_wmb();
bfqg->entity.prio_changed = 1;
}
}
static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,
struct cftype *cftype,
u64 val)
@@ -936,53 +991,70 @@ static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,
hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
struct bfq_group *bfqg = blkg_to_bfqg(blkg);
if (!bfqg)
continue;
/*
* Setting the prio_changed flag of the entity
* to 1 with new_weight == weight would re-set
* the value of the weight to its ioprio mapping.
* Set the flag only if necessary.
*/
if ((unsigned short)val != bfqg->entity.new_weight) {
bfqg->entity.new_weight = (unsigned short)val;
/*
* Make sure that the above new value has been
* stored in bfqg->entity.new_weight before
* setting the prio_changed flag. In fact,
* this flag may be read asynchronously (in
* critical sections protected by a different
* lock than that held here), and finding this
* flag set may cause the execution of the code
* for updating parameters whose value may
* depend also on bfqg->entity.new_weight (in
* __bfq_entity_update_weight_prio).
* This barrier makes sure that the new value
* of bfqg->entity.new_weight is correctly
* seen in that code.
*/
smp_wmb();
bfqg->entity.prio_changed = 1;
}
if (bfqg)
bfq_group_set_weight(bfqg, val, 0);
}
spin_unlock_irq(&blkcg->lock);
return ret;
}
static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
int ret;
struct blkg_conf_ctx ctx;
struct blkcg *blkcg = css_to_blkcg(of_css(of));
struct bfq_group *bfqg;
u64 v;
ret = blkg_conf_prep(blkcg, &blkcg_policy_bfq, buf, &ctx);
if (ret)
return ret;
if (sscanf(ctx.body, "%llu", &v) == 1) {
/* require "default" on dfl */
ret = -ERANGE;
if (!v)
goto out;
} else if (!strcmp(strim(ctx.body), "default")) {
v = 0;
} else {
ret = -EINVAL;
goto out;
}
bfqg = blkg_to_bfqg(ctx.blkg);
ret = -ERANGE;
if (!v || (v >= BFQ_MIN_WEIGHT && v <= BFQ_MAX_WEIGHT)) {
bfq_group_set_weight(bfqg, bfqg->entity.weight, v);
ret = 0;
}
out:
blkg_conf_finish(&ctx);
return ret ?: nbytes;
}
static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
u64 weight;
/* First unsigned long found in the file is used */
int ret = kstrtoull(strim(buf), 0, &weight);
char *endp;
int ret;
u64 v;
if (ret)
return ret;
buf = strim(buf);
ret = bfq_io_set_weight_legacy(of_css(of), NULL, weight);
return ret ?: nbytes;
/* "WEIGHT" or "default WEIGHT" sets the default weight */
v = simple_strtoull(buf, &endp, 0);
if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) {
ret = bfq_io_set_weight_legacy(of_css(of), NULL, v);
return ret ?: nbytes;
}
return bfq_io_set_device_weight(of, buf, nbytes, off);
}
#ifdef CONFIG_BFQ_CGROUP_DEBUG
@@ -1141,9 +1213,15 @@ struct cftype bfq_blkcg_legacy_files[] = {
{
.name = "bfq.weight",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = bfq_io_show_weight,
.seq_show = bfq_io_show_weight_legacy,
.write_u64 = bfq_io_set_weight_legacy,
},
{
.name = "bfq.weight_device",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = bfq_io_show_weight,
.write = bfq_io_set_weight,
},
/* statistics, covers only the tasks in the bfqg */
{
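
The new bfq_io_set_weight() above accepts either a bare weight or "default WEIGHT" for the group-wide default, and otherwise hands the input to bfq_io_set_device_weight(), which expects a per-device "MAJ:MIN WEIGHT" entry. Below is a rough userspace model of that dispatch, a sketch of the control flow only (the zero-weight "remove override" case is left out); all inputs are invented.

/*
 * Rough userspace model of how bfq.weight writes are dispatched: a bare
 * number or "default <w>" updates the group-wide weight, anything else is
 * treated as a per-device "MAJ:MIN <w>" entry.
 */
#include <stdio.h>
#include <stdlib.h>

static void set_default_weight(unsigned long long w)
{
	printf("default weight -> %llu\n", w);
}

static int set_device_weight(const char *buf)
{
	unsigned int major, minor;
	unsigned long long w;

	if (sscanf(buf, "%u:%u %llu", &major, &minor, &w) != 3)
		return -1;
	printf("device %u:%u weight -> %llu\n", major, minor, w);
	return 0;
}

static int bfq_set_weight(const char *buf)
{
	char *endp;
	unsigned long long v;

	v = strtoull(buf, &endp, 0);
	if (*endp == '\0' || *endp == '\n' ||
	    sscanf(buf, "default %llu", &v) == 1) {
		set_default_weight(v);
		return 0;
	}
	return set_device_weight(buf);
}

int main(void)
{
	bfq_set_weight("300");		/* group default */
	bfq_set_weight("default 500");	/* group default, explicit form */
	bfq_set_weight("8:16 100");	/* per-device override (assumed dev) */
	return 0;
}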


@@ -168,6 +168,9 @@ struct bfq_entity {
/* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
int budget;
/* device weight, if non-zero, it overrides the default weight of
* bfq_group_data */
int dev_weight;
/* weight of the queue */
int weight;
/* next weight if a change is in progress */


@@ -744,6 +744,8 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
}
#endif
/* Matches the smp_wmb() in bfq_group_set_weight. */
smp_rmb();
old_st->wsum -= entity->weight;
if (entity->new_weight != entity->orig_weight) {


@@ -646,25 +646,20 @@ static inline bool page_is_mergeable(const struct bio_vec *bv,
return true;
}
/*
* Check if the @page can be added to the current segment(@bv), and make
* sure to call it only if page_is_mergeable(@bv, @page) is true
*/
static bool can_add_page_to_seg(struct request_queue *q,
struct bio_vec *bv, struct page *page, unsigned len,
unsigned offset)
static bool bio_try_merge_pc_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned len, unsigned offset,
bool *same_page)
{
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
unsigned long mask = queue_segment_boundary(q);
phys_addr_t addr1 = page_to_phys(bv->bv_page) + bv->bv_offset;
phys_addr_t addr2 = page_to_phys(page) + offset + len - 1;
if ((addr1 | mask) != (addr2 | mask))
return false;
if (bv->bv_len + len > queue_max_segment_size(q))
return false;
return true;
return __bio_try_merge_page(bio, page, len, offset, same_page);
}
/**
@@ -674,7 +669,7 @@ static bool can_add_page_to_seg(struct request_queue *q,
* @page: page to add
* @len: vec entry length
* @offset: vec entry offset
* @put_same_page: put the page if it is same with last added page
* @same_page: return if the merge happen inside the same page
*
* Attempt to add a page to the bio_vec maplist. This can fail for a
* number of reasons, such as the bio being full or target block device
@@ -685,10 +680,9 @@ static bool can_add_page_to_seg(struct request_queue *q,
*/
static int __bio_add_pc_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset,
bool put_same_page)
bool *same_page)
{
struct bio_vec *bvec;
bool same_page = false;
/*
* cloned bio must not modify vec list
@@ -700,28 +694,16 @@ static int __bio_add_pc_page(struct request_queue *q, struct bio *bio,
return 0;
if (bio->bi_vcnt > 0) {
bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
if (page == bvec->bv_page &&
offset == bvec->bv_offset + bvec->bv_len) {
if (put_same_page)
put_page(page);
bvec->bv_len += len;
goto done;
}
if (bio_try_merge_pc_page(q, bio, page, len, offset, same_page))
return len;
/*
* If the queue doesn't support SG gaps and adding this
* offset would create a gap, disallow it.
* If the queue doesn't support SG gaps and adding this segment
* would create a gap, disallow it.
*/
bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
if (bvec_gap_to_prev(q, bvec, offset))
return 0;
if (page_is_mergeable(bvec, page, len, offset, &same_page) &&
can_add_page_to_seg(q, bvec, page, len, offset)) {
bvec->bv_len += len;
goto done;
}
}
if (bio_full(bio, len))
@@ -735,7 +717,6 @@ static int __bio_add_pc_page(struct request_queue *q, struct bio *bio,
bvec->bv_len = len;
bvec->bv_offset = offset;
bio->bi_vcnt++;
done:
bio->bi_iter.bi_size += len;
return len;
}
@@ -743,7 +724,8 @@ static int __bio_add_pc_page(struct request_queue *q, struct bio *bio,
int bio_add_pc_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset)
{
return __bio_add_pc_page(q, bio, page, len, offset, false);
bool same_page = false;
return __bio_add_pc_page(q, bio, page, len, offset, &same_page);
}
EXPORT_SYMBOL(bio_add_pc_page);
@@ -806,6 +788,9 @@ void __bio_add_page(struct bio *bio, struct page *page,
bio->bi_iter.bi_size += len;
bio->bi_vcnt++;
if (!bio_flagged(bio, BIO_WORKINGSET) && unlikely(PageWorkingset(page)))
bio_set_flag(bio, BIO_WORKINGSET);
}
EXPORT_SYMBOL_GPL(__bio_add_page);
@@ -1384,13 +1369,17 @@ struct bio *bio_map_user_iov(struct request_queue *q,
for (j = 0; j < npages; j++) {
struct page *page = pages[j];
unsigned int n = PAGE_SIZE - offs;
bool same_page = false;
if (n > bytes)
n = bytes;
if (!__bio_add_pc_page(q, bio, page, n, offs,
true))
&same_page)) {
if (same_page)
put_page(page);
break;
}
added += n;
bytes -= n;
@@ -1521,7 +1510,6 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
bio->bi_end_io = bio_map_kern_endio;
return bio;
}
EXPORT_SYMBOL(bio_map_kern);
static void bio_copy_kern_endio(struct bio *bio)
{
@@ -1842,8 +1830,8 @@ EXPORT_SYMBOL(bio_endio);
* @bio, and updates @bio to represent the remaining sectors.
*
* Unless this is a discard request the newly allocated bio will point
* to @bio's bi_io_vec; it is the caller's responsibility to ensure that
* @bio is not freed before the split.
* to @bio's bi_io_vec. It is the caller's responsibility to ensure that
* neither @bio nor @bs are freed before the split bio.
*/
struct bio *bio_split(struct bio *bio, int sectors,
gfp_t gfp, struct bio_set *bs)
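
The segment-boundary test kept in bio_try_merge_pc_page() above, "(addr1 | mask) != (addr2 | mask)", is worth a worked example: with a boundary mask of (SZ - 1), the expression is true exactly when the first and last byte of the merged range fall into different SZ-sized windows, i.e. the range crosses a segment boundary. The values below are made up for illustration.

/* Standalone sketch of the segment-boundary check. */
#include <stdint.h>
#include <stdio.h>

static int crosses_boundary(uint64_t addr1, uint64_t addr2, uint64_t mask)
{
	return (addr1 | mask) != (addr2 | mask);
}

int main(void)
{
	uint64_t mask = 0xffff;		/* assumed 64 KiB segment boundary */

	/* Range 0x1f000..0x1ffff stays inside one 64 KiB window. */
	printf("%d\n", crosses_boundary(0x1f000, 0x1ffff, mask));	/* 0 */

	/* Range 0x1f000..0x20000 straddles the window at 0x20000. */
	printf("%d\n", crosses_boundary(0x1f000, 0x20000, mask));	/* 1 */
	return 0;
}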


@@ -175,7 +175,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
continue;
/* alloc per-policy data and attach it to blkg */
pd = pol->pd_alloc_fn(gfp_mask, q->node);
pd = pol->pd_alloc_fn(gfp_mask, q, blkcg);
if (!pd)
goto err_free;
@@ -753,6 +753,44 @@ static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
return __blkg_lookup(blkcg, q, true /* update_hint */);
}
/**
* blkg_conf_prep - parse and prepare for per-blkg config update
* @inputp: input string pointer
*
* Parse the device node prefix part, MAJ:MIN, of per-blkg config update
* from @input and get and return the matching gendisk. *@inputp is
* updated to point past the device node prefix. Returns an ERR_PTR()
* value on error.
*
* Use this function iff blkg_conf_prep() can't be used for some reason.
*/
struct gendisk *blkcg_conf_get_disk(char **inputp)
{
char *input = *inputp;
unsigned int major, minor;
struct gendisk *disk;
int key_len, part;
if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
return ERR_PTR(-EINVAL);
input += key_len;
if (!isspace(*input))
return ERR_PTR(-EINVAL);
input = skip_spaces(input);
disk = get_gendisk(MKDEV(major, minor), &part);
if (!disk)
return ERR_PTR(-ENODEV);
if (part) {
put_disk_and_module(disk);
return ERR_PTR(-ENODEV);
}
*inputp = input;
return disk;
}
/**
* blkg_conf_prep - parse and prepare for per-blkg config update
* @blkcg: target block cgroup
@@ -772,25 +810,11 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
struct gendisk *disk;
struct request_queue *q;
struct blkcg_gq *blkg;
unsigned int major, minor;
int key_len, part, ret;
char *body;
int ret;
if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
return -EINVAL;
body = input + key_len;
if (!isspace(*body))
return -EINVAL;
body = skip_spaces(body);
disk = get_gendisk(MKDEV(major, minor), &part);
if (!disk)
return -ENODEV;
if (part) {
ret = -ENODEV;
goto fail;
}
disk = blkcg_conf_get_disk(&input);
if (IS_ERR(disk))
return PTR_ERR(disk);
q = disk->queue;
@@ -856,7 +880,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
success:
ctx->disk = disk;
ctx->blkg = blkg;
ctx->body = body;
ctx->body = input;
return 0;
fail_unlock:
@@ -876,6 +900,7 @@ fail:
}
return ret;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);
/**
* blkg_conf_finish - finish up per-blkg config update
@@ -891,6 +916,7 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx)
rcu_read_unlock();
put_disk_and_module(ctx->disk);
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);
static int blkcg_print_stat(struct seq_file *sf, void *v)
{
@@ -1346,7 +1372,7 @@ int blkcg_activate_policy(struct request_queue *q,
blk_mq_freeze_queue(q);
pd_prealloc:
if (!pd_prealloc) {
pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node);
pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q, &blkcg_root);
if (!pd_prealloc) {
ret = -ENOMEM;
goto out_bypass_end;
@@ -1362,7 +1388,7 @@ pd_prealloc:
if (blkg->pd[pol->plid])
continue;
pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q->node);
pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q, &blkcg_root);
if (!pd)
swap(pd, pd_prealloc);
if (!pd) {
@@ -1475,7 +1501,8 @@ int blkcg_policy_register(struct blkcg_policy *pol)
blkcg->cpd[pol->plid] = cpd;
cpd->blkcg = blkcg;
cpd->plid = pol->plid;
pol->cpd_init_fn(cpd);
if (pol->cpd_init_fn)
pol->cpd_init_fn(cpd);
}
}
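
The new blkcg_conf_get_disk() helper above parses the "MAJ:MIN" prefix of a per-blkg config write using sscanf()'s %n conversion to find where the configuration body begins. A standalone sketch of the same parsing, with an invented input string:

/* Sketch of "MAJ:MIN <body>" parsing as done by blkcg_conf_get_disk(). */
#include <ctype.h>
#include <stdio.h>

int main(void)
{
	char input[] = "8:16 default 300";
	unsigned int major, minor;
	int key_len;
	char *body;

	if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
		return 1;

	body = input + key_len;
	if (!isspace((unsigned char)*body))
		return 1;
	while (isspace((unsigned char)*body))
		body++;			/* skip_spaces() equivalent */

	printf("dev %u:%u, body: \"%s\"\n", major, minor, body);
	return 0;
}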


@@ -36,6 +36,7 @@
#include <linux/blk-cgroup.h>
#include <linux/debugfs.h>
#include <linux/bpf.h>
#include <linux/psi.h>
#define CREATE_TRACE_POINTS
#include <trace/events/block.h>
@@ -129,6 +130,7 @@ static const char *const blk_op_name[] = {
REQ_OP_NAME(DISCARD),
REQ_OP_NAME(SECURE_ERASE),
REQ_OP_NAME(ZONE_RESET),
REQ_OP_NAME(ZONE_RESET_ALL),
REQ_OP_NAME(WRITE_SAME),
REQ_OP_NAME(WRITE_ZEROES),
REQ_OP_NAME(SCSI_IN),
@@ -344,7 +346,8 @@ void blk_cleanup_queue(struct request_queue *q)
/*
* Drain all requests queued before DYING marking. Set DEAD flag to
* prevent that q->request_fn() gets invoked after draining finished.
* prevent that blk_mq_run_hw_queues() accesses the hardware queues
* after draining finished.
*/
blk_freeze_queue(q);
@@ -479,7 +482,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
if (!q)
return NULL;
INIT_LIST_HEAD(&q->queue_head);
q->last_merge = NULL;
q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
@@ -518,6 +520,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
mutex_init(&q->blk_trace_mutex);
#endif
mutex_init(&q->sysfs_lock);
mutex_init(&q->sysfs_dir_lock);
spin_lock_init(&q->queue_lock);
init_waitqueue_head(&q->mq_freeze_wq);
@@ -601,6 +604,7 @@ bool bio_attempt_back_merge(struct request *req, struct bio *bio,
return false;
trace_block_bio_backmerge(req->q, req, bio);
rq_qos_merge(req->q, req, bio);
if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
blk_rq_set_mixed_merge(req);
@@ -622,6 +626,7 @@ bool bio_attempt_front_merge(struct request *req, struct bio *bio,
return false;
trace_block_bio_frontmerge(req->q, req, bio);
rq_qos_merge(req->q, req, bio);
if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
blk_rq_set_mixed_merge(req);
@@ -647,6 +652,8 @@ bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
blk_rq_get_max_sectors(req, blk_rq_pos(req)))
goto no_merge;
rq_qos_merge(q, req, bio);
req->biotail->bi_next = bio;
req->biotail = bio;
req->__data_len += bio->bi_iter.bi_size;
@@ -931,6 +938,10 @@ generic_make_request_checks(struct bio *bio)
if (!blk_queue_is_zoned(q))
goto not_supported;
break;
case REQ_OP_ZONE_RESET_ALL:
if (!blk_queue_is_zoned(q) || !blk_queue_zone_resetall(q))
goto not_supported;
break;
case REQ_OP_WRITE_ZEROES:
if (!q->limits.max_write_zeroes_sectors)
goto not_supported;
@@ -1128,6 +1139,10 @@ EXPORT_SYMBOL_GPL(direct_make_request);
*/
blk_qc_t submit_bio(struct bio *bio)
{
bool workingset_read = false;
unsigned long pflags;
blk_qc_t ret;
if (blkcg_punt_bio_submit(bio))
return BLK_QC_T_NONE;
@@ -1146,6 +1161,8 @@ blk_qc_t submit_bio(struct bio *bio)
if (op_is_write(bio_op(bio))) {
count_vm_events(PGPGOUT, count);
} else {
if (bio_flagged(bio, BIO_WORKINGSET))
workingset_read = true;
task_io_account_read(bio->bi_iter.bi_size);
count_vm_events(PGPGIN, count);
}
@@ -1160,7 +1177,21 @@ blk_qc_t submit_bio(struct bio *bio)
}
}
return generic_make_request(bio);
/*
* If we're reading data that is part of the userspace
* workingset, count submission time as memory stall. When the
* device is congested, or the submitting cgroup IO-throttled,
* submission can be a significant part of overall IO time.
*/
if (workingset_read)
psi_memstall_enter(&pflags);
ret = generic_make_request(bio);
if (workingset_read)
psi_memstall_leave(&pflags);
return ret;
}
EXPORT_SYMBOL(submit_bio);

block/blk-iocost.c (new file, 2457 lines): diff suppressed because it is too large.


@@ -725,7 +725,7 @@ int blk_iolatency_init(struct request_queue *q)
return -ENOMEM;
rqos = &blkiolat->rqos;
rqos->id = RQ_QOS_CGROUP;
rqos->id = RQ_QOS_LATENCY;
rqos->ops = &blkcg_iolatency_ops;
rqos->q = q;
@@ -934,11 +934,13 @@ static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf,
}
static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, int node)
static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp,
struct request_queue *q,
struct blkcg *blkcg)
{
struct iolatency_grp *iolat;
iolat = kzalloc_node(sizeof(*iolat), gfp, node);
iolat = kzalloc_node(sizeof(*iolat), gfp, q->node);
if (!iolat)
return NULL;
iolat->stats = __alloc_percpu_gfp(sizeof(struct latency_stat),


@@ -132,19 +132,32 @@ static struct bio *blk_bio_write_same_split(struct request_queue *q,
return bio_split(bio, q->limits.max_write_same_sectors, GFP_NOIO, bs);
}
/*
* Return the maximum number of sectors from the start of a bio that may be
* submitted as a single request to a block device. If enough sectors remain,
* align the end to the physical block size. Otherwise align the end to the
* logical block size. This approach minimizes the number of non-aligned
* requests that are submitted to a block device if the start of a bio is not
* aligned to a physical block boundary.
*/
static inline unsigned get_max_io_size(struct request_queue *q,
struct bio *bio)
{
unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector);
unsigned mask = queue_logical_block_size(q) - 1;
unsigned max_sectors = sectors;
unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT;
unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT;
unsigned start_offset = bio->bi_iter.bi_sector & (pbs - 1);
/* aligned to logical block size */
sectors &= ~(mask >> 9);
max_sectors += start_offset;
max_sectors &= ~(pbs - 1);
if (max_sectors > start_offset)
return max_sectors - start_offset;
return sectors;
return sectors & (lbs - 1);
}
static unsigned get_max_segment_size(struct request_queue *q,
static unsigned get_max_segment_size(const struct request_queue *q,
unsigned offset)
{
unsigned long mask = queue_segment_boundary(q);
@@ -157,26 +170,41 @@ static unsigned get_max_segment_size(struct request_queue *q,
queue_max_segment_size(q));
}
/*
* Split the bvec @bv into segments, and update all kinds of
* variables.
/**
* bvec_split_segs - verify whether or not a bvec should be split in the middle
* @q: [in] request queue associated with the bio associated with @bv
* @bv: [in] bvec to examine
* @nsegs: [in,out] Number of segments in the bio being built. Incremented
* by the number of segments from @bv that may be appended to that
* bio without exceeding @max_segs
* @sectors: [in,out] Number of sectors in the bio being built. Incremented
* by the number of sectors from @bv that may be appended to that
* bio without exceeding @max_sectors
* @max_segs: [in] upper bound for *@nsegs
* @max_sectors: [in] upper bound for *@sectors
*
* When splitting a bio, it can happen that a bvec is encountered that is too
* big to fit in a single segment and hence that it has to be split in the
* middle. This function verifies whether or not that should happen. The value
* %true is returned if and only if appending the entire @bv to a bio with
* *@nsegs segments and *@sectors sectors would make that bio unacceptable for
* the block driver.
*/
static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv,
unsigned *nsegs, unsigned *sectors, unsigned max_segs)
static bool bvec_split_segs(const struct request_queue *q,
const struct bio_vec *bv, unsigned *nsegs,
unsigned *sectors, unsigned max_segs,
unsigned max_sectors)
{
unsigned len = bv->bv_len;
unsigned max_len = (min(max_sectors, UINT_MAX >> 9) - *sectors) << 9;
unsigned len = min(bv->bv_len, max_len);
unsigned total_len = 0;
unsigned new_nsegs = 0, seg_size = 0;
unsigned seg_size = 0;
/*
* Multi-page bvec may be too big to hold in one segment, so the
* current bvec has to be split into multiple segments.
*/
while (len && new_nsegs + *nsegs < max_segs) {
while (len && *nsegs < max_segs) {
seg_size = get_max_segment_size(q, bv->bv_offset + total_len);
seg_size = min(seg_size, len);
new_nsegs++;
(*nsegs)++;
total_len += seg_size;
len -= seg_size;
@@ -184,16 +212,31 @@ static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv,
break;
}
if (new_nsegs) {
*nsegs += new_nsegs;
if (sectors)
*sectors += total_len >> 9;
}
*sectors += total_len >> 9;
/* split in the middle of the bvec if len != 0 */
return !!len;
/* tell the caller to split the bvec if it is too big to fit */
return len > 0 || bv->bv_len > max_len;
}
/**
* blk_bio_segment_split - split a bio in two bios
* @q: [in] request queue pointer
* @bio: [in] bio to be split
* @bs: [in] bio set to allocate the clone from
* @segs: [out] number of segments in the bio with the first half of the sectors
*
* Clone @bio, update the bi_iter of the clone to represent the first sectors
* of @bio and update @bio->bi_iter to represent the remaining sectors. The
* following is guaranteed for the cloned bio:
* - That it has at most get_max_io_size(@q, @bio) sectors.
* - That it has at most queue_max_segments(@q) segments.
*
* Except for discard requests the cloned bio will point at the bi_io_vec of
* the original bio. It is the responsibility of the caller to ensure that the
* original bio is not freed before the cloned bio. The caller is also
* responsible for ensuring that @bs is only destroyed after processing of the
* split bio has finished.
*/
static struct bio *blk_bio_segment_split(struct request_queue *q,
struct bio *bio,
struct bio_set *bs,
@@ -213,34 +256,18 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset))
goto split;
if (sectors + (bv.bv_len >> 9) > max_sectors) {
/*
* Consider this a new segment if we're splitting in
* the middle of this vector.
*/
if (nsegs < max_segs &&
sectors < max_sectors) {
/* split in the middle of bvec */
bv.bv_len = (max_sectors - sectors) << 9;
bvec_split_segs(q, &bv, &nsegs,
&sectors, max_segs);
}
if (nsegs < max_segs &&
sectors + (bv.bv_len >> 9) <= max_sectors &&
bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
nsegs++;
sectors += bv.bv_len >> 9;
} else if (bvec_split_segs(q, &bv, &nsegs, &sectors, max_segs,
max_sectors)) {
goto split;
}
if (nsegs == max_segs)
goto split;
bvprv = bv;
bvprvp = &bvprv;
if (bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
nsegs++;
sectors += bv.bv_len >> 9;
} else if (bvec_split_segs(q, &bv, &nsegs, &sectors,
max_segs)) {
goto split;
}
}
*segs = nsegs;
@@ -250,6 +277,19 @@ split:
return bio_split(bio, sectors, GFP_NOIO, bs);
}
/**
* __blk_queue_split - split a bio and submit the second half
* @q: [in] request queue pointer
* @bio: [in, out] bio to be split
* @nr_segs: [out] number of segments in the first bio
*
* Split a bio into two bios, chain the two bios, submit the second half and
* store a pointer to the first half in *@bio. If the second bio is still too
* big it will be split by a recursive call to this function. Since this
* function may allocate a new bio from @q->bio_split, it is the responsibility
* of the caller to ensure that @q is only released after processing of the
* split bio has finished.
*/
void __blk_queue_split(struct request_queue *q, struct bio **bio,
unsigned int *nr_segs)
{
@@ -294,6 +334,17 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio,
}
}
/**
* blk_queue_split - split a bio and submit the second half
* @q: [in] request queue pointer
* @bio: [in, out] bio to be split
*
* Split a bio into two bios, chains the two bios, submit the second half and
* store a pointer to the first half in *@bio. Since this function may allocate
* a new bio from @q->bio_split, it is the responsibility of the caller to
* ensure that @q is only released after processing of the split bio has
* finished.
*/
void blk_queue_split(struct request_queue *q, struct bio **bio)
{
unsigned int nr_segs;
@@ -305,6 +356,7 @@ EXPORT_SYMBOL(blk_queue_split);
unsigned int blk_recalc_rq_segments(struct request *rq)
{
unsigned int nr_phys_segs = 0;
unsigned int nr_sectors = 0;
struct req_iterator iter;
struct bio_vec bv;
@@ -321,7 +373,8 @@ unsigned int blk_recalc_rq_segments(struct request *rq)
}
rq_for_each_bvec(bv, rq, iter)
bvec_split_segs(rq->q, &bv, &nr_phys_segs, NULL, UINT_MAX);
bvec_split_segs(rq->q, &bv, &nr_phys_segs, &nr_sectors,
UINT_MAX, UINT_MAX);
return nr_phys_segs;
}
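
The rewritten get_max_io_size() above aligns the end of the allowed range to the physical block size when enough sectors remain, otherwise falling back to logical-block alignment. A worked example of the arithmetic, with invented queue limits and start sector:

/* Worked example of the physical-block alignment in get_max_io_size(). */
#include <stdio.h>

int main(void)
{
	unsigned sectors = 255;		/* assumed per-offset queue limit */
	unsigned pbs = 8;		/* 4 KiB physical block, in sectors */
	unsigned long long start = 11;	/* assumed bio start sector */
	unsigned start_offset = start & (pbs - 1);	/* 3 */
	unsigned max_sectors = sectors;

	max_sectors += start_offset;	/* 258 */
	max_sectors &= ~(pbs - 1);	/* 256: range now ends on a physical block */
	if (max_sectors > start_offset)
		printf("max = %u sectors\n", max_sectors - start_offset); /* 253 */
	return 0;
}

With these numbers a bio starting at sector 11 is capped at 253 sectors, so it ends at sector 264, which is aligned to the 8-sector physical block.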


@@ -15,10 +15,10 @@
#include "blk.h"
#include "blk-mq.h"
static int cpu_to_queue_index(struct blk_mq_queue_map *qmap,
unsigned int nr_queues, const int cpu)
static int queue_index(struct blk_mq_queue_map *qmap,
unsigned int nr_queues, const int q)
{
return qmap->queue_offset + (cpu % nr_queues);
return qmap->queue_offset + (q % nr_queues);
}
static int get_first_sibling(unsigned int cpu)
@@ -36,21 +36,36 @@ int blk_mq_map_queues(struct blk_mq_queue_map *qmap)
{
unsigned int *map = qmap->mq_map;
unsigned int nr_queues = qmap->nr_queues;
unsigned int cpu, first_sibling;
unsigned int cpu, first_sibling, q = 0;
for_each_possible_cpu(cpu)
map[cpu] = -1;
/*
* Spread queues among present CPUs first, to minimize the number of
* dead queues that end up mapped only to non-present CPUs
*/
for_each_present_cpu(cpu) {
if (q >= nr_queues)
break;
map[cpu] = queue_index(qmap, nr_queues, q++);
}
for_each_possible_cpu(cpu) {
if (map[cpu] != -1)
continue;
/*
* First do sequential mapping between CPUs and queues.
* In case we still have CPUs to map, and we have some number of
* threads per core, then map sibling threads to the same queue
* for performance optimizations.
*/
if (cpu < nr_queues) {
map[cpu] = cpu_to_queue_index(qmap, nr_queues, cpu);
if (q < nr_queues) {
map[cpu] = queue_index(qmap, nr_queues, q++);
} else {
first_sibling = get_first_sibling(cpu);
if (first_sibling == cpu)
map[cpu] = cpu_to_queue_index(qmap, nr_queues, cpu);
map[cpu] = queue_index(qmap, nr_queues, q++);
else
map[cpu] = map[first_sibling];
}
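
The change above spreads hardware queues round-robin over present CPUs first and only then fills in the remaining possible CPUs, reusing a sibling's queue where one is already mapped. A simplified standalone model of that policy (the sibling-sharing step is omitted and the CPU sets are invented):

/* Simplified model of the present-CPUs-first spreading in blk_mq_map_queues(). */
#include <stdio.h>

#define NR_CPUS 8

int main(void)
{
	int present[NR_CPUS] = { 1, 1, 1, 1, 0, 0, 0, 0 };	/* assumed */
	unsigned int nr_queues = 3, q = 0;
	int map[NR_CPUS], cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		map[cpu] = -1;

	for (cpu = 0; cpu < NR_CPUS; cpu++)	/* present CPUs first */
		if (present[cpu])
			map[cpu] = q++ % nr_queues;

	for (cpu = 0; cpu < NR_CPUS; cpu++)	/* then the remaining possible CPUs */
		if (map[cpu] < 0)
			map[cpu] = q++ % nr_queues;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu%d -> hwq %d\n", cpu, map[cpu]);
	return 0;
}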


@@ -270,7 +270,7 @@ void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i;
lockdep_assert_held(&q->sysfs_lock);
lockdep_assert_held(&q->sysfs_dir_lock);
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_unregister_hctx(hctx);
@@ -320,7 +320,7 @@ int __blk_mq_register_dev(struct device *dev, struct request_queue *q)
int ret, i;
WARN_ON_ONCE(!q->kobj.parent);
lockdep_assert_held(&q->sysfs_lock);
lockdep_assert_held(&q->sysfs_dir_lock);
ret = kobject_add(q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
if (ret < 0)
@@ -349,23 +349,12 @@ unreg:
return ret;
}
int blk_mq_register_dev(struct device *dev, struct request_queue *q)
{
int ret;
mutex_lock(&q->sysfs_lock);
ret = __blk_mq_register_dev(dev, q);
mutex_unlock(&q->sysfs_lock);
return ret;
}
void blk_mq_sysfs_unregister(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
int i;
mutex_lock(&q->sysfs_lock);
mutex_lock(&q->sysfs_dir_lock);
if (!q->mq_sysfs_init_done)
goto unlock;
@@ -373,7 +362,7 @@ void blk_mq_sysfs_unregister(struct request_queue *q)
blk_mq_unregister_hctx(hctx);
unlock:
mutex_unlock(&q->sysfs_lock);
mutex_unlock(&q->sysfs_dir_lock);
}
int blk_mq_sysfs_register(struct request_queue *q)
@@ -381,7 +370,7 @@ int blk_mq_sysfs_register(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i, ret = 0;
mutex_lock(&q->sysfs_lock);
mutex_lock(&q->sysfs_dir_lock);
if (!q->mq_sysfs_init_done)
goto unlock;
@@ -392,7 +381,7 @@ int blk_mq_sysfs_register(struct request_queue *q)
}
unlock:
mutex_unlock(&q->sysfs_lock);
mutex_unlock(&q->sysfs_dir_lock);
return ret;
}


@@ -10,6 +10,7 @@
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/delay.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
@@ -354,6 +355,37 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
}
EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
static bool blk_mq_tagset_count_completed_rqs(struct request *rq,
void *data, bool reserved)
{
unsigned *count = data;
if (blk_mq_request_completed(rq))
(*count)++;
return true;
}
/**
* blk_mq_tagset_wait_completed_request - wait until all completed req's
* complete function is run
* @tagset: Tag set to drain completed requests
*
* Note: This function has to be run after all IO queues are shutdown
*/
void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset)
{
while (true) {
unsigned count = 0;
blk_mq_tagset_busy_iter(tagset,
blk_mq_tagset_count_completed_rqs, &count);
if (!count)
break;
msleep(5);
}
}
EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request);
/**
* blk_mq_queue_tag_busy_iter - iterate over all requests with a driver tag
* @q: Request queue to examine.


@@ -44,12 +44,12 @@ static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
static int blk_mq_poll_stats_bkt(const struct request *rq)
{
int ddir, bytes, bucket;
int ddir, sectors, bucket;
ddir = rq_data_dir(rq);
bytes = blk_rq_bytes(rq);
sectors = blk_rq_stats_sectors(rq);
bucket = ddir + 2*(ilog2(bytes) - 9);
bucket = ddir + 2 * ilog2(sectors);
if (bucket < 0)
return -1;
@@ -282,16 +282,16 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
EXPORT_SYMBOL(blk_mq_can_queue);
/*
* Only need start/end time stamping if we have stats enabled, or using
* an IO scheduler.
* Only need start/end time stamping if we have iostat or
* blk stats enabled, or using an IO scheduler.
*/
static inline bool blk_mq_need_time_stamp(struct request *rq)
{
return (rq->rq_flags & RQF_IO_STAT) || rq->q->elevator;
return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator;
}
static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
unsigned int tag, unsigned int op)
unsigned int tag, unsigned int op, u64 alloc_time_ns)
{
struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
struct request *rq = tags->static_rqs[tag];
@@ -325,11 +325,15 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
RB_CLEAR_NODE(&rq->rb_node);
rq->rq_disk = NULL;
rq->part = NULL;
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
rq->alloc_time_ns = alloc_time_ns;
#endif
if (blk_mq_need_time_stamp(rq))
rq->start_time_ns = ktime_get_ns();
else
rq->start_time_ns = 0;
rq->io_start_time_ns = 0;
rq->stats_sectors = 0;
rq->nr_phys_segments = 0;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
rq->nr_integrity_segments = 0;
@@ -356,8 +360,14 @@ static struct request *blk_mq_get_request(struct request_queue *q,
struct request *rq;
unsigned int tag;
bool clear_ctx_on_error = false;
u64 alloc_time_ns = 0;
blk_queue_enter_live(q);
/* alloc_time includes depth and tag waits */
if (blk_queue_rq_alloc_time(q))
alloc_time_ns = ktime_get_ns();
data->q = q;
if (likely(!data->ctx)) {
data->ctx = blk_mq_get_ctx(q);
@@ -393,7 +403,7 @@ static struct request *blk_mq_get_request(struct request_queue *q,
return NULL;
}
rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags);
rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags, alloc_time_ns);
if (!op_is_flush(data->cmd_flags)) {
rq->elv.icq = NULL;
if (e && e->type->ops.prepare_request) {
@@ -652,19 +662,18 @@ bool blk_mq_complete_request(struct request *rq)
}
EXPORT_SYMBOL(blk_mq_complete_request);
void blk_mq_complete_request_sync(struct request *rq)
{
WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
rq->q->mq_ops->complete(rq);
}
EXPORT_SYMBOL_GPL(blk_mq_complete_request_sync);
int blk_mq_request_started(struct request *rq)
{
return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
}
EXPORT_SYMBOL_GPL(blk_mq_request_started);
int blk_mq_request_completed(struct request *rq)
{
return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE;
}
EXPORT_SYMBOL_GPL(blk_mq_request_completed);
void blk_mq_start_request(struct request *rq)
{
struct request_queue *q = rq->q;
@@ -673,9 +682,7 @@ void blk_mq_start_request(struct request *rq)
if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
rq->io_start_time_ns = ktime_get_ns();
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
rq->throtl_size = blk_rq_sectors(rq);
#endif
rq->stats_sectors = blk_rq_sectors(rq);
rq->rq_flags |= RQF_STATS;
rq_qos_issue(q, rq);
}
@@ -2453,11 +2460,6 @@ static void blk_mq_map_swqueue(struct request_queue *q)
struct blk_mq_ctx *ctx;
struct blk_mq_tag_set *set = q->tag_set;
/*
* Avoid others reading incomplete hctx->cpumask through sysfs
*/
mutex_lock(&q->sysfs_lock);
queue_for_each_hw_ctx(q, hctx, i) {
cpumask_clear(hctx->cpumask);
hctx->nr_ctx = 0;
@@ -2518,8 +2520,6 @@ static void blk_mq_map_swqueue(struct request_queue *q)
HCTX_TYPE_DEFAULT, i);
}
mutex_unlock(&q->sysfs_lock);
queue_for_each_hw_ctx(q, hctx, i) {
/*
* If no software queues are mapped to this hardware queue,
@@ -2688,7 +2688,11 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
if (!uninit_q)
return ERR_PTR(-ENOMEM);
q = blk_mq_init_allocated_queue(set, uninit_q);
/*
* Initialize the queue without an elevator. device_add_disk() will do
* the initialization.
*/
q = blk_mq_init_allocated_queue(set, uninit_q, false);
if (IS_ERR(q))
blk_cleanup_queue(uninit_q);
@@ -2839,7 +2843,8 @@ static unsigned int nr_hw_queues(struct blk_mq_tag_set *set)
}
struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
struct request_queue *q)
struct request_queue *q,
bool elevator_init)
{
/* mark the queue as mq asap */
q->mq_ops = set->ops;
@@ -2901,18 +2906,14 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
blk_mq_add_queue_tag_set(set, q);
blk_mq_map_swqueue(q);
if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
int ret;
ret = elevator_init_mq(q);
if (ret)
return ERR_PTR(ret);
}
if (elevator_init)
elevator_init_mq(q);
return q;
err_hctxs:
kfree(q->queue_hw_ctx);
q->nr_hw_queues = 0;
err_sys_init:
blk_mq_sysfs_deinit(q);
err_poll:
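
With the change at the top of this file, the poll-stat bucket is now keyed off the sector count captured at issue time: bucket = data-direction + 2 * ilog2(sectors). A tiny arithmetic sketch (the bucket-count clamp that the kernel applies afterwards is left out, and the request sizes are examples):

/* Sketch of the poll-stats bucketing after the switch to stats sectors. */
#include <stdio.h>

static int ilog2_u32(unsigned v)
{
	return 31 - __builtin_clz(v);	/* v must be non-zero */
}

int main(void)
{
	unsigned sizes_kib[] = { 4, 8, 64, 256 };
	unsigned i;

	for (i = 0; i < sizeof(sizes_kib) / sizeof(sizes_kib[0]); i++) {
		unsigned sectors = sizes_kib[i] * 2;	/* 512-byte sectors */
		int read_bkt = 0 + 2 * ilog2_u32(sectors);
		int write_bkt = 1 + 2 * ilog2_u32(sectors);

		printf("%3u KiB: read bucket %2d, write bucket %2d\n",
		       sizes_kib[i], read_bkt, write_bkt);
	}
	return 0;
}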


@@ -207,10 +207,12 @@ EXPORT_SYMBOL(blk_post_runtime_resume);
*/
void blk_set_runtime_active(struct request_queue *q)
{
spin_lock_irq(&q->queue_lock);
q->rpm_status = RPM_ACTIVE;
pm_runtime_mark_last_busy(q->dev);
pm_request_autosuspend(q->dev);
spin_unlock_irq(&q->queue_lock);
if (q->dev) {
spin_lock_irq(&q->queue_lock);
q->rpm_status = RPM_ACTIVE;
pm_runtime_mark_last_busy(q->dev);
pm_request_autosuspend(q->dev);
spin_unlock_irq(&q->queue_lock);
}
}
EXPORT_SYMBOL(blk_set_runtime_active);


@@ -83,6 +83,15 @@ void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio)
} while (rqos);
}
void __rq_qos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio)
{
do {
if (rqos->ops->merge)
rqos->ops->merge(rqos, rq, bio);
rqos = rqos->next;
} while (rqos);
}
void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio)
{
do {
@@ -92,6 +101,15 @@ void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio)
} while (rqos);
}
void __rq_qos_queue_depth_changed(struct rq_qos *rqos)
{
do {
if (rqos->ops->queue_depth_changed)
rqos->ops->queue_depth_changed(rqos);
rqos = rqos->next;
} while (rqos);
}
/*
* Return true, if we can't increase the depth further by scaling
*/


@@ -14,7 +14,8 @@ struct blk_mq_debugfs_attr;
enum rq_qos_id {
RQ_QOS_WBT,
RQ_QOS_CGROUP,
RQ_QOS_LATENCY,
RQ_QOS_COST,
};
struct rq_wait {
@@ -35,11 +36,13 @@ struct rq_qos {
struct rq_qos_ops {
void (*throttle)(struct rq_qos *, struct bio *);
void (*track)(struct rq_qos *, struct request *, struct bio *);
void (*merge)(struct rq_qos *, struct request *, struct bio *);
void (*issue)(struct rq_qos *, struct request *);
void (*requeue)(struct rq_qos *, struct request *);
void (*done)(struct rq_qos *, struct request *);
void (*done_bio)(struct rq_qos *, struct bio *);
void (*cleanup)(struct rq_qos *, struct bio *);
void (*queue_depth_changed)(struct rq_qos *);
void (*exit)(struct rq_qos *);
const struct blk_mq_debugfs_attr *debugfs_attrs;
};
@@ -72,7 +75,7 @@ static inline struct rq_qos *wbt_rq_qos(struct request_queue *q)
static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q)
{
return rq_qos_id(q, RQ_QOS_CGROUP);
return rq_qos_id(q, RQ_QOS_LATENCY);
}
static inline const char *rq_qos_id_to_name(enum rq_qos_id id)
@@ -80,8 +83,10 @@ static inline const char *rq_qos_id_to_name(enum rq_qos_id id)
switch (id) {
case RQ_QOS_WBT:
return "wbt";
case RQ_QOS_CGROUP:
return "cgroup";
case RQ_QOS_LATENCY:
return "latency";
case RQ_QOS_COST:
return "cost";
}
return "unknown";
}
@@ -135,7 +140,9 @@ void __rq_qos_issue(struct rq_qos *rqos, struct request *rq);
void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq);
void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio);
void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio);
void __rq_qos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio);
void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio);
void __rq_qos_queue_depth_changed(struct rq_qos *rqos);
static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio)
{
@@ -185,6 +192,19 @@ static inline void rq_qos_track(struct request_queue *q, struct request *rq,
__rq_qos_track(q->rq_qos, rq, bio);
}
static inline void rq_qos_merge(struct request_queue *q, struct request *rq,
struct bio *bio)
{
if (q->rq_qos)
__rq_qos_merge(q->rq_qos, rq, bio);
}
static inline void rq_qos_queue_depth_changed(struct request_queue *q)
{
if (q->rq_qos)
__rq_qos_queue_depth_changed(q->rq_qos);
}
void rq_qos_exit(struct request_queue *);
#endif
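
The rq_qos policies (wbt, iolatency, and now iocost) are stacked on a per-queue singly linked list, and each hook, like the new __rq_qos_merge() and __rq_qos_queue_depth_changed() helpers, walks the chain calling only the callbacks a policy implements. A minimal standalone sketch of that pattern, with illustrative policy names:

/* Minimal sketch of the rq_qos chaining pattern. */
#include <stdio.h>

struct qos_ops {
	void (*merge)(const char *name);
	void (*queue_depth_changed)(const char *name);
};

struct qos {
	const char *name;
	const struct qos_ops *ops;
	struct qos *next;
};

static void report(const char *name) { printf("%s notified\n", name); }

static const struct qos_ops wbt_like_ops = { .queue_depth_changed = report };
static const struct qos_ops cost_like_ops = { .merge = report,
					      .queue_depth_changed = report };

static void qos_queue_depth_changed(struct qos *q)
{
	do {
		if (q->ops->queue_depth_changed)
			q->ops->queue_depth_changed(q->name);
		q = q->next;
	} while (q);
}

int main(void)
{
	struct qos cost = { "cost-like", &cost_like_ops, NULL };
	struct qos wbt = { "wbt-like", &wbt_like_ops, &cost };

	qos_queue_depth_changed(&wbt);	/* walks wbt-like, then cost-like */
	return 0;
}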


@@ -805,7 +805,7 @@ EXPORT_SYMBOL(blk_queue_update_dma_alignment);
void blk_set_queue_depth(struct request_queue *q, unsigned int depth)
{
q->queue_depth = depth;
wbt_set_queue_depth(q, depth);
rq_qos_queue_depth_changed(q);
}
EXPORT_SYMBOL(blk_set_queue_depth);
@@ -832,6 +832,22 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
}
EXPORT_SYMBOL_GPL(blk_queue_write_cache);
/**
* blk_queue_required_elevator_features - Set a queue required elevator features
* @q: the request queue for the target device
* @features: Required elevator features OR'ed together
*
* Tell the block layer that for the device controlled through @q, only the
* only elevators that can be used are those that implement at least the set of
* features specified by @features.
*/
void blk_queue_required_elevator_features(struct request_queue *q,
unsigned int features)
{
q->required_elevator_features = features;
}
EXPORT_SYMBOL_GPL(blk_queue_required_elevator_features);
static int __init blk_settings_init(void)
{
blk_max_low_pfn = max_low_pfn - 1;
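
blk_queue_required_elevator_features() above records a feature mask that elevator_match()/elv_support_features() later test with "(required & supported) == required", i.e. the elevator must provide every required bit. A tiny illustration, with a made-up feature flag standing in for a real elevator feature bit:

/* Tiny illustration of the feature test used by elv_support_features(). */
#include <stdio.h>

#define ELEV_F_ZONED_WRITE_ORDER (1U << 0)	/* assumed feature bit */

static int elv_support_features(unsigned int elv_features,
				unsigned int required_features)
{
	return (required_features & elv_features) == required_features;
}

int main(void)
{
	unsigned int required = ELEV_F_ZONED_WRITE_ORDER;

	printf("%d\n", elv_support_features(0, required));			  /* 0 */
	printf("%d\n", elv_support_features(ELEV_F_ZONED_WRITE_ORDER, required)); /* 1 */
	printf("%d\n", elv_support_features(0, 0));				  /* 1 */
	return 0;
}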


@@ -941,14 +941,14 @@ int blk_register_queue(struct gendisk *disk)
int ret;
struct device *dev = disk_to_dev(disk);
struct request_queue *q = disk->queue;
bool has_elevator = false;
if (WARN_ON(!q))
return -ENXIO;
WARN_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags),
WARN_ONCE(blk_queue_registered(q),
"%s is registering an already registered queue\n",
kobject_name(&dev->kobj));
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
/*
* SCSI probing may synchronously create and destroy a lot of
@@ -968,8 +968,7 @@ int blk_register_queue(struct gendisk *disk)
if (ret)
return ret;
/* Prevent changes through sysfs until registration is completed. */
mutex_lock(&q->sysfs_lock);
mutex_lock(&q->sysfs_dir_lock);
ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue");
if (ret < 0) {
@@ -990,26 +989,36 @@ int blk_register_queue(struct gendisk *disk)
blk_mq_debugfs_register(q);
}
kobject_uevent(&q->kobj, KOBJ_ADD);
wbt_enable_default(q);
blk_throtl_register_queue(q);
/*
* QUEUE_FLAG_REGISTERED isn't set yet, so an elevator
* switch can't happen at all.
*/
if (q->elevator) {
ret = elv_register_queue(q);
ret = elv_register_queue(q, false);
if (ret) {
mutex_unlock(&q->sysfs_lock);
kobject_uevent(&q->kobj, KOBJ_REMOVE);
mutex_unlock(&q->sysfs_dir_lock);
kobject_del(&q->kobj);
blk_trace_remove_sysfs(dev);
kobject_put(&dev->kobj);
return ret;
}
has_elevator = true;
}
mutex_lock(&q->sysfs_lock);
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
wbt_enable_default(q);
blk_throtl_register_queue(q);
/* Now everything is ready and send out KOBJ_ADD uevent */
kobject_uevent(&q->kobj, KOBJ_ADD);
if (has_elevator)
kobject_uevent(&q->elevator->kobj, KOBJ_ADD);
mutex_unlock(&q->sysfs_lock);
ret = 0;
unlock:
mutex_unlock(&q->sysfs_lock);
mutex_unlock(&q->sysfs_dir_lock);
return ret;
}
EXPORT_SYMBOL_GPL(blk_register_queue);
@@ -1029,7 +1038,7 @@ void blk_unregister_queue(struct gendisk *disk)
return;
/* Return early if disk->queue was never registered. */
if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
if (!blk_queue_registered(q))
return;
/*
@@ -1038,25 +1047,28 @@ void blk_unregister_queue(struct gendisk *disk)
* concurrent elv_iosched_store() calls.
*/
mutex_lock(&q->sysfs_lock);
blk_queue_flag_clear(QUEUE_FLAG_REGISTERED, q);
mutex_unlock(&q->sysfs_lock);
mutex_lock(&q->sysfs_dir_lock);
/*
* Remove the sysfs attributes before unregistering the queue data
* structures that can be modified through sysfs.
*/
if (queue_is_mq(q))
blk_mq_unregister_dev(disk_to_dev(disk), q);
mutex_unlock(&q->sysfs_lock);
kobject_uevent(&q->kobj, KOBJ_REMOVE);
kobject_del(&q->kobj);
blk_trace_remove_sysfs(disk_to_dev(disk));
mutex_lock(&q->sysfs_lock);
/*
* q->kobj has been removed, so it is safe to check if elevator
* exists without holding q->sysfs_lock.
*/
if (q->elevator)
elv_unregister_queue(q);
mutex_unlock(&q->sysfs_lock);
mutex_unlock(&q->sysfs_dir_lock);
kobject_put(&disk_to_dev(disk)->kobj);
}


@@ -478,12 +478,14 @@ static void throtl_service_queue_init(struct throtl_service_queue *sq)
timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0);
}
static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp,
struct request_queue *q,
struct blkcg *blkcg)
{
struct throtl_grp *tg;
int rw;
tg = kzalloc_node(sizeof(*tg), gfp, node);
tg = kzalloc_node(sizeof(*tg), gfp, q->node);
if (!tg)
return NULL;
@@ -2246,7 +2248,8 @@ void blk_throtl_stat_add(struct request *rq, u64 time_ns)
struct request_queue *q = rq->q;
struct throtl_data *td = q->td;
throtl_track_latency(td, rq->throtl_size, req_op(rq), time_ns >> 10);
throtl_track_latency(td, blk_rq_stats_sectors(rq), req_op(rq),
time_ns >> 10);
}
void blk_throtl_bio_endio(struct bio *bio)


@@ -629,15 +629,6 @@ static void wbt_requeue(struct rq_qos *rqos, struct request *rq)
}
}
void wbt_set_queue_depth(struct request_queue *q, unsigned int depth)
{
struct rq_qos *rqos = wbt_rq_qos(q);
if (rqos) {
RQWB(rqos)->rq_depth.queue_depth = depth;
__wbt_update_limits(RQWB(rqos));
}
}
void wbt_set_write_cache(struct request_queue *q, bool write_cache_on)
{
struct rq_qos *rqos = wbt_rq_qos(q);
@@ -656,7 +647,7 @@ void wbt_enable_default(struct request_queue *q)
return;
/* Queue not registered? Maybe shutting down... */
if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
if (!blk_queue_registered(q))
return;
if (queue_is_mq(q) && IS_ENABLED(CONFIG_BLK_WBT_MQ))
@@ -689,6 +680,12 @@ static int wbt_data_dir(const struct request *rq)
return -1;
}
static void wbt_queue_depth_changed(struct rq_qos *rqos)
{
RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->q);
__wbt_update_limits(RQWB(rqos));
}
static void wbt_exit(struct rq_qos *rqos)
{
struct rq_wb *rwb = RQWB(rqos);
@@ -811,6 +808,7 @@ static struct rq_qos_ops wbt_rqos_ops = {
.requeue = wbt_requeue,
.done = wbt_done,
.cleanup = wbt_cleanup,
.queue_depth_changed = wbt_queue_depth_changed,
.exit = wbt_exit,
#ifdef CONFIG_BLK_DEBUG_FS
.debugfs_attrs = wbt_debugfs_attrs,
@@ -853,7 +851,7 @@ int wbt_init(struct request_queue *q)
rwb->min_lat_nsec = wbt_default_latency_nsec(q);
wbt_set_queue_depth(q, blk_queue_depth(q));
wbt_queue_depth_changed(&rwb->rqos);
wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
return 0;


@@ -95,7 +95,6 @@ void wbt_enable_default(struct request_queue *);
u64 wbt_get_min_lat(struct request_queue *q);
void wbt_set_min_lat(struct request_queue *q, u64 val);
void wbt_set_queue_depth(struct request_queue *, unsigned int);
void wbt_set_write_cache(struct request_queue *, bool);
u64 wbt_default_latency_nsec(struct request_queue *);
@@ -118,9 +117,6 @@ static inline void wbt_disable_default(struct request_queue *q)
static inline void wbt_enable_default(struct request_queue *q)
{
}
static inline void wbt_set_queue_depth(struct request_queue *q, unsigned int depth)
{
}
static inline void wbt_set_write_cache(struct request_queue *q, bool wc)
{
}


@@ -202,6 +202,42 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector,
}
EXPORT_SYMBOL_GPL(blkdev_report_zones);
/*
* Special case of zone reset operation to reset all zones in one command,
* useful for applications like mkfs.
*/
static int __blkdev_reset_all_zones(struct block_device *bdev, gfp_t gfp_mask)
{
struct bio *bio = bio_alloc(gfp_mask, 0);
int ret;
/* across the zones operations, don't need any sectors */
bio_set_dev(bio, bdev);
bio_set_op_attrs(bio, REQ_OP_ZONE_RESET_ALL, 0);
ret = submit_bio_wait(bio);
bio_put(bio);
return ret;
}
static inline bool blkdev_allow_reset_all_zones(struct block_device *bdev,
sector_t nr_sectors)
{
if (!blk_queue_zone_resetall(bdev_get_queue(bdev)))
return false;
if (nr_sectors != part_nr_sects_read(bdev->bd_part))
return false;
/*
* REQ_OP_ZONE_RESET_ALL can be executed only if the block device is
* the entire disk, that is, if the block device's start offset is 0 and
* its capacity is the same as the entire disk.
*/
return get_start_sect(bdev) == 0 &&
part_nr_sects_read(bdev->bd_part) == get_capacity(bdev->bd_disk);
}
/**
* blkdev_reset_zones - Reset zones write pointer
* @bdev: Target block device
@@ -235,6 +271,9 @@ int blkdev_reset_zones(struct block_device *bdev,
/* Out of range */
return -EINVAL;
if (blkdev_allow_reset_all_zones(bdev, nr_sectors))
return __blkdev_reset_all_zones(bdev, gfp_mask);
/* Check alignment (handle eventual smaller last zone) */
zone_sectors = blk_queue_zone_sectors(q);
if (sector & (zone_sectors - 1))
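
With the hunk above, a reset that spans the entire device can be issued as a single REQ_OP_ZONE_RESET_ALL command when the hardware supports it. From userspace the same request is typically made through the BLKRESETZONE ioctl covering the full capacity; the sketch below assumes a zoned device node and is illustrative only.

/* Hedged userspace sketch: reset every zone of a zoned block device. */
#include <fcntl.h>
#include <stdio.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/blkzoned.h>
#include <linux/fs.h>

int main(void)
{
	const char *dev = "/dev/nullb0";	/* assumed zoned device */
	struct blk_zone_range range;
	uint64_t bytes;
	int fd;

	fd = open(dev, O_RDWR);
	if (fd < 0 || ioctl(fd, BLKGETSIZE64, &bytes) < 0) {
		perror(dev);
		return 1;
	}

	range.sector = 0;			/* start of the disk */
	range.nr_sectors = bytes >> 9;		/* entire capacity */

	if (ioctl(fd, BLKRESETZONE, &range) < 0)
		perror("BLKRESETZONE");

	close(fd);
	return 0;
}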


@@ -184,11 +184,11 @@ void blk_account_io_done(struct request *req, u64 now);
void blk_insert_flush(struct request *rq);
int elevator_init_mq(struct request_queue *q);
void elevator_init_mq(struct request_queue *q);
int elevator_switch_mq(struct request_queue *q,
struct elevator_type *new_e);
void __elevator_exit(struct request_queue *, struct elevator_queue *);
int elv_register_queue(struct request_queue *q);
int elv_register_queue(struct request_queue *q, bool uevent);
void elv_unregister_queue(struct request_queue *q);
static inline void elevator_exit(struct request_queue *q,


@@ -83,8 +83,26 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio)
}
EXPORT_SYMBOL(elv_bio_merge_ok);
static bool elevator_match(const struct elevator_type *e, const char *name)
static inline bool elv_support_features(unsigned int elv_features,
unsigned int required_features)
{
return (required_features & elv_features) == required_features;
}
/**
* elevator_match - Test an elevator name and features
* @e: Scheduler to test
* @name: Elevator name to test
* @required_features: Features that the elevator must provide
*
* Return true if the elevator @e name matches @name and if @e provides all
* the features specified by @required_features.
*/
static bool elevator_match(const struct elevator_type *e, const char *name,
unsigned int required_features)
{
if (!elv_support_features(e->elevator_features, required_features))
return false;
if (!strcmp(e->elevator_name, name))
return true;
if (e->elevator_alias && !strcmp(e->elevator_alias, name))
@@ -93,15 +111,21 @@ static bool elevator_match(const struct elevator_type *e, const char *name)
return false;
}
/*
* Return scheduler with name 'name'
/**
* elevator_find - Find an elevator
* @name: Name of the elevator to find
* @required_features: Features that the elevator must provide
*
* Return the first registered scheduler with name @name and supporting the
* features @required_features and NULL otherwise.
*/
static struct elevator_type *elevator_find(const char *name)
static struct elevator_type *elevator_find(const char *name,
unsigned int required_features)
{
struct elevator_type *e;
list_for_each_entry(e, &elv_list, list) {
if (elevator_match(e, name))
if (elevator_match(e, name, required_features))
return e;
}
@@ -120,12 +144,12 @@ static struct elevator_type *elevator_get(struct request_queue *q,
spin_lock(&elv_list_lock);
e = elevator_find(name);
e = elevator_find(name, q->required_elevator_features);
if (!e && try_loading) {
spin_unlock(&elv_list_lock);
request_module("%s-iosched", name);
spin_lock(&elv_list_lock);
e = elevator_find(name);
e = elevator_find(name, q->required_elevator_features);
}
if (e && !try_module_get(e->elevator_owner))
@@ -135,20 +159,6 @@ static struct elevator_type *elevator_get(struct request_queue *q,
return e;
}
static char chosen_elevator[ELV_NAME_MAX];
static int __init elevator_setup(char *str)
{
/*
* Be backwards-compatible with previous kernels, so users
* won't get the wrong elevator.
*/
strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
return 1;
}
__setup("elevator=", elevator_setup);
static struct kobj_type elv_ktype;
struct elevator_queue *elevator_alloc(struct request_queue *q,
@@ -470,13 +480,16 @@ static struct kobj_type elv_ktype = {
.release = elevator_release,
};
int elv_register_queue(struct request_queue *q)
/*
* elv_register_queue is called from either blk_register_queue or
* elevator_switch, and an elevator switch is prevented from happening
* in either path, so it is safe to not hold q->sysfs_lock.
*/
int elv_register_queue(struct request_queue *q, bool uevent)
{
struct elevator_queue *e = q->elevator;
int error;
lockdep_assert_held(&q->sysfs_lock);
error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
if (!error) {
struct elv_fs_entry *attr = e->type->elevator_attrs;
@@ -487,24 +500,34 @@ int elv_register_queue(struct request_queue *q)
attr++;
}
}
kobject_uevent(&e->kobj, KOBJ_ADD);
if (uevent)
kobject_uevent(&e->kobj, KOBJ_ADD);
mutex_lock(&q->sysfs_lock);
e->registered = 1;
mutex_unlock(&q->sysfs_lock);
}
return error;
}
/*
* elv_unregister_queue is called from either blk_unregister_queue or
* elevator_switch, and an elevator switch is prevented from happening
* in either path, so it is safe to not hold q->sysfs_lock.
*/
void elv_unregister_queue(struct request_queue *q)
{
lockdep_assert_held(&q->sysfs_lock);
if (q) {
struct elevator_queue *e = q->elevator;
kobject_uevent(&e->kobj, KOBJ_REMOVE);
kobject_del(&e->kobj);
mutex_lock(&q->sysfs_lock);
e->registered = 0;
/* Re-enable throttling in case elevator disabled it */
wbt_enable_default(q);
mutex_unlock(&q->sysfs_lock);
}
}
@@ -526,7 +549,7 @@ int elv_register(struct elevator_type *e)
/* register, don't allow duplicate names */
spin_lock(&elv_list_lock);
if (elevator_find(e->elevator_name)) {
if (elevator_find(e->elevator_name, 0)) {
spin_unlock(&elv_list_lock);
kmem_cache_destroy(e->icq_cache);
return -EBUSY;
@@ -567,10 +590,32 @@ int elevator_switch_mq(struct request_queue *q,
lockdep_assert_held(&q->sysfs_lock);
if (q->elevator) {
if (q->elevator->registered)
if (q->elevator->registered) {
mutex_unlock(&q->sysfs_lock);
/*
* A concurrent elevator switch can't happen because a
* sysfs write is always exclusive on the same file.
*
* Also the elevator queue won't be freed after
* sysfs_lock is released because kobject_del() in
* blk_unregister_queue() waits for completion of
* .store & .show on its attributes.
*/
elv_unregister_queue(q);
mutex_lock(&q->sysfs_lock);
}
ioc_clear_queue(q);
elevator_exit(q, q->elevator);
/*
* sysfs_lock may have been dropped above, so re-check whether the
* queue has been unregistered. If it has, don't switch to the
* new elevator.
*/
if (!blk_queue_registered(q))
return 0;
}
ret = blk_mq_init_sched(q, new_e);
@@ -578,7 +623,11 @@ int elevator_switch_mq(struct request_queue *q,
goto out;
if (new_e) {
ret = elv_register_queue(q);
mutex_unlock(&q->sysfs_lock);
ret = elv_register_queue(q, true);
mutex_lock(&q->sysfs_lock);
if (ret) {
elevator_exit(q, q->elevator);
goto out;
@@ -594,37 +643,89 @@ out:
return ret;
}
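
For context on why sysfs_lock is dropped and retaken above, this is the switching path as I read it (a sketch of the callers; the lock is taken in the sysfs store path, outside elevator.c):

/*
 *   echo mq-deadline > /sys/block/<dev>/queue/scheduler
 *     -> queue_attr_store()           takes q->sysfs_lock
 *       -> elv_iosched_store()
 *         -> __elevator_change()
 *           -> elevator_switch()
 *             -> elevator_switch_mq() drops/retakes sysfs_lock around
 *                                     elv_unregister_queue() and
 *                                     elv_register_queue()
 */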
static inline bool elv_support_iosched(struct request_queue *q)
{
if (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED))
return false;
return true;
}
/*
* For blk-mq devices, we default to using mq-deadline, if available, for single
* queue devices. If deadline isn't available OR we have multiple queues,
* default to "none".
* For single queue devices, default to using mq-deadline. If we have multiple
* queues or mq-deadline is not available, default to "none".
*/
int elevator_init_mq(struct request_queue *q)
static struct elevator_type *elevator_get_default(struct request_queue *q)
{
if (q->nr_hw_queues != 1)
return NULL;
return elevator_get(q, "mq-deadline", false);
}
/*
* Get the first elevator providing the features required by the request queue.
* Default to "none" if no matching elevator is found.
*/
static struct elevator_type *elevator_get_by_features(struct request_queue *q)
{
struct elevator_type *e, *found = NULL;
spin_lock(&elv_list_lock);
list_for_each_entry(e, &elv_list, list) {
if (elv_support_features(e->elevator_features,
q->required_elevator_features)) {
found = e;
break;
}
}
if (found && !try_module_get(found->elevator_owner))
found = NULL;
spin_unlock(&elv_list_lock);
return found;
}
/*
* For a device queue that has no required features, use the default elevator
* settings. Otherwise, use the first elevator available matching the required
* features. If no suitable elevator is found, or if initialization of the
* chosen elevator fails, fall back to the "none" elevator (no elevator).
*/
void elevator_init_mq(struct request_queue *q)
{
struct elevator_type *e;
int err = 0;
int err;
if (q->nr_hw_queues != 1)
return 0;
if (!elv_support_iosched(q))
return;
WARN_ON_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags));
/*
* q->sysfs_lock must be held to provide mutual exclusion between
* elevator_switch() and here.
*/
mutex_lock(&q->sysfs_lock);
if (unlikely(q->elevator))
goto out_unlock;
return;
e = elevator_get(q, "mq-deadline", false);
if (!q->required_elevator_features)
e = elevator_get_default(q);
else
e = elevator_get_by_features(q);
if (!e)
goto out_unlock;
return;
blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
err = blk_mq_init_sched(q, e);
if (err)
blk_mq_unquiesce_queue(q);
blk_mq_unfreeze_queue(q);
if (err) {
pr_warn("\"%s\" elevator initialization failed, "
"falling back to \"none\"\n", e->elevator_name);
elevator_put(e);
out_unlock:
mutex_unlock(&q->sysfs_lock);
return err;
}
}
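
The required_elevator_features consulted above are set from the driver side of this series. Here is a sketch of the opt-in, patterned on the zoned-disk usage; the wrapper function below is hypothetical, while blk_queue_required_elevator_features() is the helper this series adds:

#include <linux/blkdev.h>

/* Hypothetical driver hook illustrating the opt-in (sketch only). */
static void example_setup_zoned_queue(struct request_queue *q)
{
	/*
	 * Restrict elevator_init_mq()/elevator_get_by_features() to
	 * schedulers that serialize sequential zone writes; mq-deadline
	 * advertises ELEVATOR_F_ZBD_SEQ_WRITE in a later hunk.
	 */
	blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);
}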
@@ -660,7 +761,7 @@ static int __elevator_change(struct request_queue *q, const char *name)
struct elevator_type *e;
/* Make sure queue is not in the middle of being removed */
if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
if (!blk_queue_registered(q))
return -ENOENT;
/*
@@ -677,7 +778,8 @@ static int __elevator_change(struct request_queue *q, const char *name)
if (!e)
return -EINVAL;
if (q->elevator && elevator_match(q->elevator->type, elevator_name)) {
if (q->elevator &&
elevator_match(q->elevator->type, elevator_name, 0)) {
elevator_put(e);
return 0;
}
@@ -685,13 +787,6 @@ static int __elevator_change(struct request_queue *q, const char *name)
return elevator_switch(q, e);
}
static inline bool elv_support_iosched(struct request_queue *q)
{
if (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED))
return false;
return true;
}
ssize_t elv_iosched_store(struct request_queue *q, const char *name,
size_t count)
{
@@ -724,11 +819,13 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name)
spin_lock(&elv_list_lock);
list_for_each_entry(__e, &elv_list, list) {
if (elv && elevator_match(elv, __e->elevator_name)) {
if (elv && elevator_match(elv, __e->elevator_name, 0)) {
len += sprintf(name+len, "[%s] ", elv->elevator_name);
continue;
}
if (elv_support_iosched(q))
if (elv_support_iosched(q) &&
elevator_match(__e, __e->elevator_name,
q->required_elevator_features))
len += sprintf(name+len, "%s ", __e->elevator_name);
}
spin_unlock(&elv_list_lock);
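
Net effect on the sysfs side, as I understand it (illustrative, not output captured from a real device):

/*
 * On a zoned disk that requires ELEVATOR_F_ZBD_SEQ_WRITE, reading
 * /sys/block/<dev>/queue/scheduler now lists only matching schedulers,
 * e.g. "[mq-deadline] none", instead of also offering bfq or kyber.
 */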


@@ -695,6 +695,15 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk,
dev_t devt;
int retval;
/*
* The disk queue should now be all set with enough information about
* the device for the elevator code to pick an adequate default
* elevator if one is needed, that is, for devices requesting queue
* registration.
*/
if (register_queue)
elevator_init_mq(disk->queue);
/* minors == 0 indicates to use ext devt from part0 and should
* be accompanied with EXT_DEVT flag. Make sure all
* parameters make sense.


@@ -377,13 +377,6 @@ done:
* hardware queue, but we may return a request that is for a
* different hardware queue. This is because mq-deadline has shared
* state for all hardware queues, in terms of sorting, FIFOs, etc.
*
* For a zoned block device, __dd_dispatch_request() may return NULL
* if all the queued write requests are directed at zones that are already
* locked due to on-going write requests. In this case, make sure to mark
* the queue as needing a restart to ensure that the queue is run again
* and the pending writes dispatched once the target zones for the ongoing
* write requests are unlocked in dd_finish_request().
*/
static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
@@ -392,9 +385,6 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
spin_lock(&dd->lock);
rq = __dd_dispatch_request(dd);
if (!rq && blk_queue_is_zoned(hctx->queue) &&
!list_empty(&dd->fifo_list[WRITE]))
blk_mq_sched_mark_restart_hctx(hctx);
spin_unlock(&dd->lock);
return rq;
@@ -561,6 +551,13 @@ static void dd_prepare_request(struct request *rq, struct bio *bio)
* spinlock so that the zone is never unlocked while deadline_fifo_request()
* or deadline_next_request() are executing. This function is called for
* all requests, whether or not these requests complete successfully.
*
* For a zoned block device, __dd_dispatch_request() may have stopped
* dispatching requests if all the queued requests are write requests directed
* at zones that are already locked due to on-going write requests. To ensure
* write request dispatch progress in this case, mark the queue as needing a
* restart so that it is run again once this request completes and its
* target zone is unlocked.
*/
static void dd_finish_request(struct request *rq)
{
@@ -572,6 +569,8 @@ static void dd_finish_request(struct request *rq)
spin_lock_irqsave(&dd->zone_lock, flags);
blk_req_zone_write_unlock(rq);
if (!list_empty(&dd->fifo_list[WRITE]))
blk_mq_sched_mark_restart_hctx(rq->mq_hctx);
spin_unlock_irqrestore(&dd->zone_lock, flags);
}
}
@@ -795,6 +794,7 @@ static struct elevator_type mq_deadline = {
.elevator_attrs = deadline_attrs,
.elevator_name = "mq-deadline",
.elevator_alias = "deadline",
.elevator_features = ELEVATOR_F_ZBD_SEQ_WRITE,
.elevator_owner = THIS_MODULE,
};
MODULE_ALIAS("mq-deadline-iosched");


@@ -119,8 +119,6 @@ enum opal_uid {
OPAL_UID_HEXFF,
};
#define OPAL_METHOD_LENGTH 8
/* Enum for indexing the OPALMETHOD array */
enum opal_method {
OPAL_PROPERTIES,
@@ -167,7 +165,6 @@ enum opal_token {
OPAL_TABLE_LASTID = 0x0A,
OPAL_TABLE_MIN = 0x0B,
OPAL_TABLE_MAX = 0x0C,
/* authority table */
OPAL_PIN = 0x03,
/* locking tokens */
@@ -182,7 +179,7 @@ enum opal_token {
OPAL_LIFECYCLE = 0x06,
/* locking info table */
OPAL_MAXRANGES = 0x04,
/* mbr control */
/* mbr control */
OPAL_MBRENABLE = 0x01,
OPAL_MBRDONE = 0x02,
/* properties */


@@ -129,7 +129,6 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = {
{ 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x84, 0x01 },
/* tables */
[OPAL_TABLE_TABLE]
{ 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01 },
[OPAL_LOCKINGRANGE_GLOBAL] =
@@ -152,7 +151,6 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = {
{ 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x00 },
/* C_PIN_TABLE object ID's */
[OPAL_C_PIN_MSID] =
{ 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x84, 0x02},
[OPAL_C_PIN_SID] =
@@ -161,7 +159,6 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = {
{ 0x00, 0x00, 0x00, 0x0B, 0x00, 0x01, 0x00, 0x01},
/* half UID's (only first 4 bytes used) */
[OPAL_HALF_UID_AUTHORITY_OBJ_REF] =
{ 0x00, 0x00, 0x0C, 0x05, 0xff, 0xff, 0xff, 0xff },
[OPAL_HALF_UID_BOOLEAN_ACE] =
@@ -517,6 +514,7 @@ static int opal_discovery0(struct opal_dev *dev, void *data)
ret = opal_recv_cmd(dev);
if (ret)
return ret;
return opal_discovery0_end(dev);
}
@@ -525,6 +523,7 @@ static int opal_discovery0_step(struct opal_dev *dev)
const struct opal_step discovery0_step = {
opal_discovery0,
};
return execute_step(dev, &discovery0_step, 0);
}
@@ -551,6 +550,7 @@ static void add_token_u8(int *err, struct opal_dev *cmd, u8 tok)
{
if (!can_add(err, cmd, 1))
return;
cmd->cmd[cmd->pos++] = tok;
}
@@ -577,6 +577,7 @@ static void add_medium_atom_header(struct opal_dev *cmd, bool bytestring,
header0 |= bytestring ? MEDIUM_ATOM_BYTESTRING : 0;
header0 |= has_sign ? MEDIUM_ATOM_SIGNED : 0;
header0 |= (len >> 8) & MEDIUM_ATOM_LEN_MASK;
cmd->cmd[cmd->pos++] = header0;
cmd->cmd[cmd->pos++] = len;
}
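
A worked example of the header math above; the constant values are assumptions recalled from opal_proto.h rather than shown in this diff:

/*
 * Assuming MEDIUM_ATOM_ID 0xc0, MEDIUM_ATOM_BYTESTRING 0x10,
 * MEDIUM_ATOM_SIGNED 0x08 and MEDIUM_ATOM_LEN_MASK 0x07, an unsigned
 * 260-byte bytestring encodes as:
 *
 *   header0  = 0xc0 | 0x10 | ((260 >> 8) & 0x07) = 0xd1
 *   len byte = 260 & 0xff                        = 0x04
 *
 * i.e. an 11-bit length split across the two header bytes.
 */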
@@ -649,6 +650,7 @@ static int build_locking_range(u8 *buffer, size_t length, u8 lr)
if (lr == 0)
return 0;
buffer[5] = LOCKING_RANGE_NON_GLOBAL;
buffer[7] = lr;
@@ -903,10 +905,6 @@ static int response_parse(const u8 *buf, size_t length,
num_entries++;
}
if (num_entries == 0) {
pr_debug("Couldn't parse response.\n");
return -EINVAL;
}
resp->num = num_entries;
return 0;
@@ -945,6 +943,7 @@ static size_t response_get_string(const struct parsed_resp *resp, int n,
}
*store = tok->pos + skip;
return tok->len - skip;
}
@@ -1062,6 +1061,7 @@ static int start_opal_session_cont(struct opal_dev *dev)
dev->hsn = hsn;
dev->tsn = tsn;
return 0;
}
@@ -1084,6 +1084,7 @@ static int end_session_cont(struct opal_dev *dev)
{
dev->hsn = 0;
dev->tsn = 0;
return parse_and_check_status(dev);
}
@@ -1172,6 +1173,7 @@ static int gen_key(struct opal_dev *dev, void *data)
return err;
}
return finalize_and_send(dev, parse_and_check_status);
}
@@ -1184,12 +1186,14 @@ static int get_active_key_cont(struct opal_dev *dev)
error = parse_and_check_status(dev);
if (error)
return error;
keylen = response_get_string(&dev->parsed, 4, &activekey);
if (!activekey) {
pr_debug("%s: Couldn't extract the Activekey from the response\n",
__func__);
return OPAL_INVAL_PARAM;
}
dev->prev_data = kmemdup(activekey, keylen, GFP_KERNEL);
if (!dev->prev_data)
@@ -1251,6 +1255,7 @@ static int generic_lr_enable_disable(struct opal_dev *dev,
add_token_u8(&err, dev, OPAL_ENDLIST);
add_token_u8(&err, dev, OPAL_ENDNAME);
return err;
}
@@ -1263,6 +1268,7 @@ static inline int enable_global_lr(struct opal_dev *dev, u8 *uid,
0, 0);
if (err)
pr_debug("Failed to create enable global lr command\n");
return err;
}
@@ -1313,7 +1319,6 @@ static int setup_locking_range(struct opal_dev *dev, void *data)
if (err) {
pr_debug("Error building Setup Locking range command.\n");
return err;
}
return finalize_and_send(dev, parse_and_check_status);
@@ -1393,6 +1398,7 @@ static int start_SIDASP_opal_session(struct opal_dev *dev, void *data)
kfree(key);
dev->prev_data = NULL;
}
return ret;
}
@@ -1518,6 +1524,7 @@ static int erase_locking_range(struct opal_dev *dev, void *data)
pr_debug("Error building Erase Locking Range Command.\n");
return err;
}
return finalize_and_send(dev, parse_and_check_status);
}
@@ -1636,6 +1643,7 @@ static int write_shadow_mbr(struct opal_dev *dev, void *data)
off += len;
}
return err;
}
@@ -1816,6 +1824,7 @@ static int lock_unlock_locking_range(struct opal_dev *dev, void *data)
pr_debug("Error building SET command.\n");
return err;
}
return finalize_and_send(dev, parse_and_check_status);
}
@@ -1857,6 +1866,7 @@ static int lock_unlock_locking_range_sum(struct opal_dev *dev, void *data)
pr_debug("Error building SET command.\n");
return ret;
}
return finalize_and_send(dev, parse_and_check_status);
}
@@ -1957,6 +1967,7 @@ static int end_opal_session(struct opal_dev *dev, void *data)
if (err < 0)
return err;
return finalize_and_send(dev, end_session_cont);
}
@@ -1965,6 +1976,7 @@ static int end_opal_session_error(struct opal_dev *dev)
const struct opal_step error_end_session = {
end_opal_session,
};
return execute_step(dev, &error_end_session, 0);
}
@@ -1984,6 +1996,7 @@ static int check_opal_support(struct opal_dev *dev)
ret = opal_discovery0_step(dev);
dev->supported = !ret;
mutex_unlock(&dev->dev_lock);
return ret;
}
@@ -2004,6 +2017,7 @@ void free_opal_dev(struct opal_dev *dev)
{
if (!dev)
return;
clean_opal_dev(dev);
kfree(dev);
}
@@ -2026,6 +2040,7 @@ struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv)
kfree(dev);
return NULL;
}
return dev;
}
EXPORT_SYMBOL(init_opal_dev);
@@ -2045,6 +2060,7 @@ static int opal_secure_erase_locking_range(struct opal_dev *dev,
setup_opal_dev(dev);
ret = execute_steps(dev, erase_steps, ARRAY_SIZE(erase_steps));
mutex_unlock(&dev->dev_lock);
return ret;
}
@@ -2062,6 +2078,7 @@ static int opal_erase_locking_range(struct opal_dev *dev,
setup_opal_dev(dev);
ret = execute_steps(dev, erase_steps, ARRAY_SIZE(erase_steps));
mutex_unlock(&dev->dev_lock);
return ret;
}
@@ -2089,6 +2106,7 @@ static int opal_enable_disable_shadow_mbr(struct opal_dev *dev,
setup_opal_dev(dev);
ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps));
mutex_unlock(&dev->dev_lock);
return ret;
}
@@ -2113,6 +2131,7 @@ static int opal_set_mbr_done(struct opal_dev *dev,
setup_opal_dev(dev);
ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps));
mutex_unlock(&dev->dev_lock);
return ret;
}
@@ -2133,6 +2152,7 @@ static int opal_write_shadow_mbr(struct opal_dev *dev,
setup_opal_dev(dev);
ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps));
mutex_unlock(&dev->dev_lock);
return ret;
}
@@ -2151,6 +2171,7 @@ static int opal_save(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk)
setup_opal_dev(dev);
add_suspend_info(dev, suspend);
mutex_unlock(&dev->dev_lock);
return 0;
}
@@ -2169,12 +2190,14 @@ static int opal_add_user_to_lr(struct opal_dev *dev,
pr_debug("Locking state was not RO or RW\n");
return -EINVAL;
}
if (lk_unlk->session.who < OPAL_USER1 ||
lk_unlk->session.who > OPAL_USER9) {
pr_debug("Authority was not within the range of users: %d\n",
lk_unlk->session.who);
return -EINVAL;
}
if (lk_unlk->session.sum) {
pr_debug("%s not supported in sum. Use setup locking range\n",
__func__);
@@ -2185,6 +2208,7 @@ static int opal_add_user_to_lr(struct opal_dev *dev,
setup_opal_dev(dev);
ret = execute_steps(dev, steps, ARRAY_SIZE(steps));
mutex_unlock(&dev->dev_lock);
return ret;
}
@@ -2267,6 +2291,7 @@ static int opal_lock_unlock(struct opal_dev *dev,
mutex_lock(&dev->dev_lock);
ret = __opal_lock_unlock(dev, lk_unlk);
mutex_unlock(&dev->dev_lock);
return ret;
}
@@ -2289,6 +2314,7 @@ static int opal_take_ownership(struct opal_dev *dev, struct opal_key *opal)
setup_opal_dev(dev);
ret = execute_steps(dev, owner_steps, ARRAY_SIZE(owner_steps));
mutex_unlock(&dev->dev_lock);
return ret;
}
@@ -2310,6 +2336,7 @@ static int opal_activate_lsp(struct opal_dev *dev,
setup_opal_dev(dev);
ret = execute_steps(dev, active_steps, ARRAY_SIZE(active_steps));
mutex_unlock(&dev->dev_lock);
return ret;
}
@@ -2327,6 +2354,7 @@ static int opal_setup_locking_range(struct opal_dev *dev,
setup_opal_dev(dev);
ret = execute_steps(dev, lr_steps, ARRAY_SIZE(lr_steps));
mutex_unlock(&dev->dev_lock);
return ret;
}
@@ -2347,6 +2375,7 @@ static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw)
setup_opal_dev(dev);
ret = execute_steps(dev, pw_steps, ARRAY_SIZE(pw_steps));
mutex_unlock(&dev->dev_lock);
return ret;
}
@@ -2371,6 +2400,7 @@ static int opal_activate_user(struct opal_dev *dev,
setup_opal_dev(dev);
ret = execute_steps(dev, act_steps, ARRAY_SIZE(act_steps));
mutex_unlock(&dev->dev_lock);
return ret;
}
@@ -2382,6 +2412,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev)
if (!dev)
return false;
if (!dev->supported)
return false;
@@ -2399,6 +2430,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev)
suspend->unlk.session.sum);
was_failure = true;
}
if (dev->mbr_enabled) {
ret = __opal_set_mbr_done(dev, &suspend->unlk.session.opal_key);
if (ret)
@@ -2406,6 +2438,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev)
}
}
mutex_unlock(&dev->dev_lock);
return was_failure;
}
EXPORT_SYMBOL(opal_unlock_from_suspend);
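
For reference, the helper exported above is consumed on the resume path of disk drivers. A condensed sketch of the typical caller, modeled on the NVMe driver's reset path (field names per my reading of the tree, not this diff):

/* Condensed, hypothetical resume-path caller (sketch). */
static void example_resume(struct nvme_dev *dev)
{
	if (dev->ctrl.opal_dev)
		opal_unlock_from_suspend(dev->ctrl.opal_dev);
}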