for-linus-20190715

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAl0s1ZEQHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpiCEEACE9H/pXoegTTWIVPVajMlsa19UHIeilk4N
 GI7oKSiirQEMZnAOmrEzgB4/0zyYQsVypys0gZlYUD3GJVsXDT3zzjNXL5NpVg/O
 nqwSGWMHBSjWkLbaM40Pb2QLXsYgveptNL+9PtxrgtoYPoT5/+TyrJMFrRfi72EK
 WFeNDKOu6aJxpJ26JSsckJ0gluKeeEpRoEqsgHGIwaMIGHQf+b+ikk7tel5FAIgA
 uDwwD+Oxsdgh/ChsXL0d90GkcbcSp6GQ7GybxVmw/tPijx6mpeIY72xY3Zx+t8zF
 b71UNk6NmCKjOPO/6fiuYKKTYw+KhzlyEKO0j675HKfx2AhchEwKw0irp4yUlydA
 zxWYmz4U7iRgktJtymv3J4FEQQ3S6d1EnuQkQNX1LwiOsEsfzhkWi+7jy7KFhZoJ
 AqtYzqnOXvLx92q0vloj06HtK6zo+I/MINldy0+qn9lq0N0VF+dctyztAHLsF7P6
 pUtS6i7l1JSFKAmMhC31sIj5TImaehM2e/TWMUPEDZaO96oKCmQwOF1oiloc6vlW
 h4xWsxP/9zOFcWNyPzy6Vo3JUXWRvFA7K+jV3Hsukw6rVHiNCGVYGSlTv8Roi5b7
 I4ggu9R2JOGyku7UIlL50IRxEyjAp11LaO8yHhcCnRB65rmyBuNMQNcfOsfxpZ5Y
 1mtSNhm5TQ==
 =g8xI
 -----END PGP SIGNATURE-----

Merge tag 'for-linus-20190715' of git://git.kernel.dk/linux-block

Pull more block updates from Jens Axboe:
 "A later pull request with some followup items. I had some vacation
  coming up to the merge window, so certain items were delayed a
  bit. This pull request also contains fixes that came in within the
  last few days of the merge window, which I didn't want to push right
  before sending you a pull request.

  This contains:

   - NVMe pull request, mostly fixes, but also a few minor items on the
     feature side that were timing constrained (Christoph et al)

   - Report zones fixes (Damien)

   - Removal of dead code (Damien)

   - Turn on cgroup psi memstall (Josef)

   - block cgroup MAINTAINERS entry (Konstantin)

   - Flush init fix (Josef)

   - blk-throttle low iops timing fix (Konstantin)

   - nbd resize fixes (Mike)

   - nbd 0 blocksize crash fix (Xiubo)

   - block integrity error leak fix (Wenwen)

   - blk-cgroup writeback and priority inheritance fixes (Tejun)"

* tag 'for-linus-20190715' of git://git.kernel.dk/linux-block: (42 commits)
  MAINTAINERS: add entry for block io cgroup
  null_blk: fixup ->report_zones() for !CONFIG_BLK_DEV_ZONED
  block: Limit zone array allocation size
  sd_zbc: Fix report zones buffer allocation
  block: Kill gfp_t argument of blkdev_report_zones()
  block: Allow mapping of vmalloc-ed buffers
  block/bio-integrity: fix a memory leak bug
  nvme: fix NULL deref for fabrics options
  nbd: add netlink reconfigure resize support
  nbd: fix crash when the blksize is zero
  block: Disable write plugging for zoned block devices
  block: Fix elevator name declaration
  block: Remove unused definitions
  nvme: fix regression upon hot device removal and insertion
  blk-throttle: fix zero wait time for iops throttled group
  block: Fix potential overflow in blk_report_zones()
  blkcg: implement REQ_CGROUP_PUNT
  blkcg, writeback: Implement wbc_blkcg_css()
  blkcg, writeback: Add wbc->no_cgroup_owner
  blkcg, writeback: Rename wbc_account_io() to wbc_account_cgroup_owner()
  ...
Linus Torvalds 2019-07-15 21:20:52 -07:00
commit 9637d51734
50 changed files with 661 additions and 211 deletions


@@ -2124,7 +2124,7 @@ following two functions.
   a queue (device) has been associated with the bio and
   before submission.
-  wbc_account_io(@wbc, @page, @bytes)
+  wbc_account_cgroup_owner(@wbc, @page, @bytes)
   Should be called for each data segment being written out.
   While this function doesn't care exactly when it's called
   during the writeback session, it's the easiest and most

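For reference, a minimal sketch of the call pattern the documentation above describes; the surrounding helper is illustrative only and not part of this series, but wbc_init_bio() and the renamed wbc_account_cgroup_owner() are the two functions the text refers to:

	/* Sketch: account each written page to the wbc's cgroup owner. */
	static void example_writepage(struct page *page, struct writeback_control *wbc,
				      struct bio *bio)
	{
		wbc_init_bio(wbc, bio);				/* associate the bio with the wbc's blkcg */
		wbc_account_cgroup_owner(wbc, page, PAGE_SIZE);	/* one call per data segment */
		submit_bio(bio);
	}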

@@ -843,11 +843,6 @@ elevator_latter_req_fn These return the request before or after the
 elevator_completed_req_fn called when a request is completed.
-elevator_may_queue_fn returns true if the scheduler wants to allow the
- current context to queue a new request even if
- it is over the queue limit. This must be used
- very carefully!!
 elevator_set_req_fn
 elevator_put_req_fn Must be used to allocate and free any elevator
 specific storage for a request.


@@ -4183,6 +4183,19 @@ S: Maintained
 F: mm/memcontrol.c
 F: mm/swap_cgroup.c
 
+CONTROL GROUP - BLOCK IO CONTROLLER (BLKIO)
+M: Tejun Heo <tj@kernel.org>
+M: Jens Axboe <axboe@kernel.dk>
+L: cgroups@vger.kernel.org
+L: linux-block@vger.kernel.org
+T: git git://git.kernel.dk/linux-block
+F: Documentation/cgroup-v1/blkio-controller.rst
+F: block/blk-cgroup.c
+F: include/linux/blk-cgroup.h
+F: block/blk-throttle.c
+F: block/blk-iolatency.c
+F: block/bfq-cgroup.c
+
 CORETEMP HARDWARE MONITORING DRIVER
 M: Fenghua Yu <fenghua.yu@intel.com>
 L: linux-hwmon@vger.kernel.org


@@ -276,8 +276,12 @@ bool bio_integrity_prep(struct bio *bio)
 		ret = bio_integrity_add_page(bio, virt_to_page(buf),
 					     bytes, offset);
-		if (ret == 0)
-			return false;
+		if (ret == 0) {
+			printk(KERN_ERR "could not attach integrity payload\n");
+			kfree(buf);
+			status = BLK_STS_RESOURCE;
+			goto err_end_io;
+		}
 		if (ret < bytes)
 			break;


@ -16,6 +16,7 @@
#include <linux/workqueue.h> #include <linux/workqueue.h>
#include <linux/cgroup.h> #include <linux/cgroup.h>
#include <linux/blk-cgroup.h> #include <linux/blk-cgroup.h>
#include <linux/highmem.h>
#include <trace/events/block.h> #include <trace/events/block.h>
#include "blk.h" #include "blk.h"
@ -1441,8 +1442,22 @@ void bio_unmap_user(struct bio *bio)
bio_put(bio); bio_put(bio);
} }
static void bio_invalidate_vmalloc_pages(struct bio *bio)
{
#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
if (bio->bi_private && !op_is_write(bio_op(bio))) {
unsigned long i, len = 0;
for (i = 0; i < bio->bi_vcnt; i++)
len += bio->bi_io_vec[i].bv_len;
invalidate_kernel_vmap_range(bio->bi_private, len);
}
#endif
}
static void bio_map_kern_endio(struct bio *bio) static void bio_map_kern_endio(struct bio *bio)
{ {
bio_invalidate_vmalloc_pages(bio);
bio_put(bio); bio_put(bio);
} }
@ -1463,6 +1478,8 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
unsigned long start = kaddr >> PAGE_SHIFT; unsigned long start = kaddr >> PAGE_SHIFT;
const int nr_pages = end - start; const int nr_pages = end - start;
bool is_vmalloc = is_vmalloc_addr(data);
struct page *page;
int offset, i; int offset, i;
struct bio *bio; struct bio *bio;
@ -1470,6 +1487,11 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
if (!bio) if (!bio)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
if (is_vmalloc) {
flush_kernel_vmap_range(data, len);
bio->bi_private = data;
}
offset = offset_in_page(kaddr); offset = offset_in_page(kaddr);
for (i = 0; i < nr_pages; i++) { for (i = 0; i < nr_pages; i++) {
unsigned int bytes = PAGE_SIZE - offset; unsigned int bytes = PAGE_SIZE - offset;
@ -1480,7 +1502,11 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
if (bytes > len) if (bytes > len)
bytes = len; bytes = len;
if (bio_add_pc_page(q, bio, virt_to_page(data), bytes, if (!is_vmalloc)
page = virt_to_page(data);
else
page = vmalloc_to_page(data);
if (bio_add_pc_page(q, bio, page, bytes,
offset) < bytes) { offset) < bytes) {
/* we don't support partial mappings */ /* we don't support partial mappings */
bio_put(bio); bio_put(bio);

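For context, a usage sketch of the vmalloc support added above; the helper name is illustrative and not part of the series, but bio_map_kern() with a vmalloc-ed buffer is exactly what the change enables:

	/* Sketch (illustrative helper): map a vmalloc-ed buffer for a
	 * passthrough request on queue @q. */
	static struct bio *example_map_vmalloc_buf(struct request_queue *q,
						   void *buf, unsigned int len)
	{
		/* buf may now come from vmalloc(); bio_map_kern() performs the
		 * kernel vmap flush/invalidate handling added in this patch. */
		return bio_map_kern(q, buf, len, GFP_KERNEL);
	}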

@ -29,6 +29,7 @@
#include <linux/ctype.h> #include <linux/ctype.h>
#include <linux/blk-cgroup.h> #include <linux/blk-cgroup.h>
#include <linux/tracehook.h> #include <linux/tracehook.h>
#include <linux/psi.h>
#include "blk.h" #include "blk.h"
#define MAX_KEY_LEN 100 #define MAX_KEY_LEN 100
@ -47,12 +48,14 @@ struct blkcg blkcg_root;
EXPORT_SYMBOL_GPL(blkcg_root); EXPORT_SYMBOL_GPL(blkcg_root);
struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css; struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
EXPORT_SYMBOL_GPL(blkcg_root_css);
static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */ static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */
static bool blkcg_debug_stats = false; static bool blkcg_debug_stats = false;
static struct workqueue_struct *blkcg_punt_bio_wq;
static bool blkcg_policy_enabled(struct request_queue *q, static bool blkcg_policy_enabled(struct request_queue *q,
const struct blkcg_policy *pol) const struct blkcg_policy *pol)
@ -87,6 +90,8 @@ static void __blkg_release(struct rcu_head *rcu)
{ {
struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head); struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
WARN_ON(!bio_list_empty(&blkg->async_bios));
/* release the blkcg and parent blkg refs this blkg has been holding */ /* release the blkcg and parent blkg refs this blkg has been holding */
css_put(&blkg->blkcg->css); css_put(&blkg->blkcg->css);
if (blkg->parent) if (blkg->parent)
@ -112,6 +117,23 @@ static void blkg_release(struct percpu_ref *ref)
call_rcu(&blkg->rcu_head, __blkg_release); call_rcu(&blkg->rcu_head, __blkg_release);
} }
static void blkg_async_bio_workfn(struct work_struct *work)
{
struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
async_bio_work);
struct bio_list bios = BIO_EMPTY_LIST;
struct bio *bio;
/* as long as there are pending bios, @blkg can't go away */
spin_lock_bh(&blkg->async_bio_lock);
bio_list_merge(&bios, &blkg->async_bios);
bio_list_init(&blkg->async_bios);
spin_unlock_bh(&blkg->async_bio_lock);
while ((bio = bio_list_pop(&bios)))
submit_bio(bio);
}
/** /**
* blkg_alloc - allocate a blkg * blkg_alloc - allocate a blkg
* @blkcg: block cgroup the new blkg is associated with * @blkcg: block cgroup the new blkg is associated with
@ -140,6 +162,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
blkg->q = q; blkg->q = q;
INIT_LIST_HEAD(&blkg->q_node); INIT_LIST_HEAD(&blkg->q_node);
spin_lock_init(&blkg->async_bio_lock);
bio_list_init(&blkg->async_bios);
INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
blkg->blkcg = blkcg; blkg->blkcg = blkcg;
for (i = 0; i < BLKCG_MAX_POLS; i++) { for (i = 0; i < BLKCG_MAX_POLS; i++) {
@ -1526,6 +1551,25 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
} }
EXPORT_SYMBOL_GPL(blkcg_policy_unregister); EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
bool __blkcg_punt_bio_submit(struct bio *bio)
{
struct blkcg_gq *blkg = bio->bi_blkg;
/* consume the flag first */
bio->bi_opf &= ~REQ_CGROUP_PUNT;
/* never bounce for the root cgroup */
if (!blkg->parent)
return false;
spin_lock_bh(&blkg->async_bio_lock);
bio_list_add(&blkg->async_bios, bio);
spin_unlock_bh(&blkg->async_bio_lock);
queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
return true;
}
/* /*
* Scale the accumulated delay based on how long it has been since we updated * Scale the accumulated delay based on how long it has been since we updated
* the delay. We only call this when we are adding delay, in case it's been a * the delay. We only call this when we are adding delay, in case it's been a
@ -1587,6 +1631,7 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
*/ */
static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
{ {
unsigned long pflags;
u64 now = ktime_to_ns(ktime_get()); u64 now = ktime_to_ns(ktime_get());
u64 exp; u64 exp;
u64 delay_nsec = 0; u64 delay_nsec = 0;
@ -1613,11 +1658,8 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
*/ */
delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC); delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
/* if (use_memdelay)
* TODO: the use_memdelay flag is going to be for the upcoming psi stuff psi_memstall_enter(&pflags);
* that hasn't landed upstream yet. Once that stuff is in place we need
* to do a psi_memstall_enter/leave if memdelay is set.
*/
exp = ktime_add_ns(now, delay_nsec); exp = ktime_add_ns(now, delay_nsec);
tok = io_schedule_prepare(); tok = io_schedule_prepare();
@ -1627,6 +1669,9 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
break; break;
} while (!fatal_signal_pending(current)); } while (!fatal_signal_pending(current));
io_schedule_finish(tok); io_schedule_finish(tok);
if (use_memdelay)
psi_memstall_leave(&pflags);
} }
/** /**
@ -1726,5 +1771,16 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
atomic64_add(delta, &blkg->delay_nsec); atomic64_add(delta, &blkg->delay_nsec);
} }
static int __init blkcg_init(void)
{
blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
WQ_MEM_RECLAIM | WQ_FREEZABLE |
WQ_UNBOUND | WQ_SYSFS, 0);
if (!blkcg_punt_bio_wq)
return -ENOMEM;
return 0;
}
subsys_initcall(blkcg_init);
module_param(blkcg_debug_stats, bool, 0644); module_param(blkcg_debug_stats, bool, 0644);
MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not"); MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");

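A minimal sketch of how a submitter could use the new punt path; the bio is assumed to already be associated with a blkcg, and the flag is consumed by blkcg_punt_bio_submit(), which submit_bio() now calls first (see the blk-core change below):

	/*
	 * Sketch only: bio->bi_blkg is assumed to be set (e.g. via
	 * bio_associate_blkg()).  Setting the flag makes submit_bio() hand
	 * the bio to the blkcg's punt workqueue instead of issuing it from
	 * the current, possibly throttled, context.
	 */
	bio->bi_opf |= REQ_CGROUP_PUNT;
	submit_bio(bio);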

@@ -117,6 +117,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 	rq->internal_tag = -1;
 	rq->start_time_ns = ktime_get_ns();
 	rq->part = NULL;
+	refcount_set(&rq->ref, 1);
 }
 EXPORT_SYMBOL(blk_rq_init);
@@ -687,7 +688,7 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
 	struct request *rq;
 	struct list_head *plug_list;
-	plug = current->plug;
+	plug = blk_mq_plug(q, bio);
 	if (!plug)
 		return false;
@@ -1127,6 +1128,9 @@ EXPORT_SYMBOL_GPL(direct_make_request);
  */
 blk_qc_t submit_bio(struct bio *bio)
 {
+	if (blkcg_punt_bio_submit(bio))
+		return BLK_QC_T_NONE;
 	/*
 	 * If it's a regular read/write or a barrier with data attached,
 	 * go through the normal accounting stuff before submission.


@@ -1973,7 +1973,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	blk_mq_bio_to_request(rq, bio, nr_segs);
-	plug = current->plug;
+	plug = blk_mq_plug(q, bio);
 	if (unlikely(is_flush_fua)) {
 		/* bypass scheduler for flush rq */
 		blk_insert_flush(rq);


@ -233,4 +233,36 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
qmap->mq_map[cpu] = 0; qmap->mq_map[cpu] = 0;
} }
/*
* blk_mq_plug() - Get caller context plug
* @q: request queue
* @bio : the bio being submitted by the caller context
*
* Plugging, by design, may delay the insertion of BIOs into the elevator in
* order to increase BIO merging opportunities. This however can cause BIO
* insertion order to change from the order in which submit_bio() is being
* executed in the case of multiple contexts concurrently issuing BIOs to a
* device, even if these context are synchronized to tightly control BIO issuing
* order. While this is not a problem with regular block devices, this ordering
* change can cause write BIO failures with zoned block devices as these
* require sequential write patterns to zones. Prevent this from happening by
* ignoring the plug state of a BIO issuing context if the target request queue
* is for a zoned block device and the BIO to plug is a write operation.
*
* Return current->plug if the bio can be plugged and NULL otherwise
*/
static inline struct blk_plug *blk_mq_plug(struct request_queue *q,
struct bio *bio)
{
/*
* For regular block devices or read operations, use the context plug
* which may be NULL if blk_start_plug() was not executed.
*/
if (!blk_queue_is_zoned(q) || !op_is_write(bio_op(bio)))
return current->plug;
/* Zoned block device write operation case: do not plug the BIO */
return NULL;
}
#endif #endif


@@ -881,13 +881,10 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
 	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
 	u64 tmp;
-	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
-	/* Slice has just started. Consider one slice interval */
-	if (!jiffy_elapsed)
-		jiffy_elapsed_rnd = tg->td->throtl_slice;
-	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
+	jiffy_elapsed = jiffies - tg->slice_start[rw];
+	/* Round up to the next throttle slice, wait time must be nonzero */
+	jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice);
 	/*
 	 * jiffy_elapsed_rnd should not be a big value as minimum iops can be


@ -14,6 +14,9 @@
#include <linux/rbtree.h> #include <linux/rbtree.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/blk-mq.h> #include <linux/blk-mq.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>
#include "blk.h" #include "blk.h"
@ -70,7 +73,7 @@ EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
static inline unsigned int __blkdev_nr_zones(struct request_queue *q, static inline unsigned int __blkdev_nr_zones(struct request_queue *q,
sector_t nr_sectors) sector_t nr_sectors)
{ {
unsigned long zone_sectors = blk_queue_zone_sectors(q); sector_t zone_sectors = blk_queue_zone_sectors(q);
return (nr_sectors + zone_sectors - 1) >> ilog2(zone_sectors); return (nr_sectors + zone_sectors - 1) >> ilog2(zone_sectors);
} }
@ -117,8 +120,7 @@ static bool blkdev_report_zone(struct block_device *bdev, struct blk_zone *rep)
} }
static int blk_report_zones(struct gendisk *disk, sector_t sector, static int blk_report_zones(struct gendisk *disk, sector_t sector,
struct blk_zone *zones, unsigned int *nr_zones, struct blk_zone *zones, unsigned int *nr_zones)
gfp_t gfp_mask)
{ {
struct request_queue *q = disk->queue; struct request_queue *q = disk->queue;
unsigned int z = 0, n, nrz = *nr_zones; unsigned int z = 0, n, nrz = *nr_zones;
@ -127,8 +129,7 @@ static int blk_report_zones(struct gendisk *disk, sector_t sector,
while (z < nrz && sector < capacity) { while (z < nrz && sector < capacity) {
n = nrz - z; n = nrz - z;
ret = disk->fops->report_zones(disk, sector, &zones[z], &n, ret = disk->fops->report_zones(disk, sector, &zones[z], &n);
gfp_mask);
if (ret) if (ret)
return ret; return ret;
if (!n) if (!n)
@ -149,17 +150,18 @@ static int blk_report_zones(struct gendisk *disk, sector_t sector,
* @sector: Sector from which to report zones * @sector: Sector from which to report zones
* @zones: Array of zone structures where to return the zones information * @zones: Array of zone structures where to return the zones information
* @nr_zones: Number of zone structures in the zone array * @nr_zones: Number of zone structures in the zone array
* @gfp_mask: Memory allocation flags (for bio_alloc)
* *
* Description: * Description:
* Get zone information starting from the zone containing @sector. * Get zone information starting from the zone containing @sector.
* The number of zone information reported may be less than the number * The number of zone information reported may be less than the number
* requested by @nr_zones. The number of zones actually reported is * requested by @nr_zones. The number of zones actually reported is
* returned in @nr_zones. * returned in @nr_zones.
* The caller must use memalloc_noXX_save/restore() calls to control
* memory allocations done within this function (zone array and command
* buffer allocation by the device driver).
*/ */
int blkdev_report_zones(struct block_device *bdev, sector_t sector, int blkdev_report_zones(struct block_device *bdev, sector_t sector,
struct blk_zone *zones, unsigned int *nr_zones, struct blk_zone *zones, unsigned int *nr_zones)
gfp_t gfp_mask)
{ {
struct request_queue *q = bdev_get_queue(bdev); struct request_queue *q = bdev_get_queue(bdev);
unsigned int i, nrz; unsigned int i, nrz;
@ -184,7 +186,7 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector,
nrz = min(*nr_zones, nrz = min(*nr_zones,
__blkdev_nr_zones(q, bdev->bd_part->nr_sects - sector)); __blkdev_nr_zones(q, bdev->bd_part->nr_sects - sector));
ret = blk_report_zones(bdev->bd_disk, get_start_sect(bdev) + sector, ret = blk_report_zones(bdev->bd_disk, get_start_sect(bdev) + sector,
zones, &nrz, gfp_mask); zones, &nrz);
if (ret) if (ret)
return ret; return ret;
@ -305,9 +307,7 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
if (!zones) if (!zones)
return -ENOMEM; return -ENOMEM;
ret = blkdev_report_zones(bdev, rep.sector, ret = blkdev_report_zones(bdev, rep.sector, zones, &rep.nr_zones);
zones, &rep.nr_zones,
GFP_KERNEL);
if (ret) if (ret)
goto out; goto out;
@ -373,22 +373,25 @@ static inline unsigned long *blk_alloc_zone_bitmap(int node,
* Allocate an array of struct blk_zone to get nr_zones zone information. * Allocate an array of struct blk_zone to get nr_zones zone information.
* The allocated array may be smaller than nr_zones. * The allocated array may be smaller than nr_zones.
*/ */
static struct blk_zone *blk_alloc_zones(int node, unsigned int *nr_zones) static struct blk_zone *blk_alloc_zones(unsigned int *nr_zones)
{ {
size_t size = *nr_zones * sizeof(struct blk_zone); struct blk_zone *zones;
struct page *page; size_t nrz = min(*nr_zones, BLK_ZONED_REPORT_MAX_ZONES);
int order;
for (order = get_order(size); order >= 0; order--) { /*
page = alloc_pages_node(node, GFP_NOIO | __GFP_ZERO, order); * GFP_KERNEL here is meaningless as the caller task context has
if (page) { * the PF_MEMALLOC_NOIO flag set in blk_revalidate_disk_zones()
*nr_zones = min_t(unsigned int, *nr_zones, * with memalloc_noio_save().
(PAGE_SIZE << order) / sizeof(struct blk_zone)); */
return page_address(page); zones = kvcalloc(nrz, sizeof(struct blk_zone), GFP_KERNEL);
} if (!zones) {
*nr_zones = 0;
return NULL;
} }
return NULL; *nr_zones = nrz;
return zones;
} }
void blk_queue_free_zone_bitmaps(struct request_queue *q) void blk_queue_free_zone_bitmaps(struct request_queue *q)
@ -415,6 +418,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
unsigned long *seq_zones_wlock = NULL, *seq_zones_bitmap = NULL; unsigned long *seq_zones_wlock = NULL, *seq_zones_bitmap = NULL;
unsigned int i, rep_nr_zones = 0, z = 0, nrz; unsigned int i, rep_nr_zones = 0, z = 0, nrz;
struct blk_zone *zones = NULL; struct blk_zone *zones = NULL;
unsigned int noio_flag;
sector_t sector = 0; sector_t sector = 0;
int ret = 0; int ret = 0;
@ -427,6 +431,12 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
return 0; return 0;
} }
/*
* Ensure that all memory allocations in this context are done as
* if GFP_NOIO was specified.
*/
noio_flag = memalloc_noio_save();
if (!blk_queue_is_zoned(q) || !nr_zones) { if (!blk_queue_is_zoned(q) || !nr_zones) {
nr_zones = 0; nr_zones = 0;
goto update; goto update;
@ -443,13 +453,13 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
/* Get zone information and initialize seq_zones_bitmap */ /* Get zone information and initialize seq_zones_bitmap */
rep_nr_zones = nr_zones; rep_nr_zones = nr_zones;
zones = blk_alloc_zones(q->node, &rep_nr_zones); zones = blk_alloc_zones(&rep_nr_zones);
if (!zones) if (!zones)
goto out; goto out;
while (z < nr_zones) { while (z < nr_zones) {
nrz = min(nr_zones - z, rep_nr_zones); nrz = min(nr_zones - z, rep_nr_zones);
ret = blk_report_zones(disk, sector, zones, &nrz, GFP_NOIO); ret = blk_report_zones(disk, sector, zones, &nrz);
if (ret) if (ret)
goto out; goto out;
if (!nrz) if (!nrz)
@ -480,8 +490,9 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
blk_mq_unfreeze_queue(q); blk_mq_unfreeze_queue(q);
out: out:
free_pages((unsigned long)zones, memalloc_noio_restore(noio_flag);
get_order(rep_nr_zones * sizeof(struct blk_zone)));
kvfree(zones);
kfree(seq_zones_wlock); kfree(seq_zones_wlock);
kfree(seq_zones_bitmap); kfree(seq_zones_bitmap);

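With the gfp_t argument gone, callers that need GFP_NOIO behaviour scope it themselves, as the updated kernel-doc above notes. A minimal caller sketch, mirroring the dm-zoned change later in this series (the wrapper name is illustrative):

	/* Sketch: report zones without risking I/O recursion now that the
	 * gfp_t argument has been removed. */
	static int example_report_zones(struct block_device *bdev, sector_t sector,
					struct blk_zone *zones, unsigned int *nr_zones)
	{
		unsigned int noio_flag;
		int ret;

		noio_flag = memalloc_noio_save();	/* allocations behave as GFP_NOIO */
		ret = blkdev_report_zones(bdev, sector, zones, nr_zones);
		memalloc_noio_restore(noio_flag);
		return ret;
	}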

@ -134,6 +134,8 @@ static struct dentry *nbd_dbg_dir;
#define NBD_MAGIC 0x68797548 #define NBD_MAGIC 0x68797548
#define NBD_DEF_BLKSIZE 1024
static unsigned int nbds_max = 16; static unsigned int nbds_max = 16;
static int max_part = 16; static int max_part = 16;
static struct workqueue_struct *recv_workqueue; static struct workqueue_struct *recv_workqueue;
@ -1236,6 +1238,14 @@ static void nbd_clear_sock_ioctl(struct nbd_device *nbd,
nbd_config_put(nbd); nbd_config_put(nbd);
} }
static bool nbd_is_valid_blksize(unsigned long blksize)
{
if (!blksize || !is_power_of_2(blksize) || blksize < 512 ||
blksize > PAGE_SIZE)
return false;
return true;
}
/* Must be called with config_lock held */ /* Must be called with config_lock held */
static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
unsigned int cmd, unsigned long arg) unsigned int cmd, unsigned long arg)
@ -1251,8 +1261,9 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
case NBD_SET_SOCK: case NBD_SET_SOCK:
return nbd_add_socket(nbd, arg, false); return nbd_add_socket(nbd, arg, false);
case NBD_SET_BLKSIZE: case NBD_SET_BLKSIZE:
if (!arg || !is_power_of_2(arg) || arg < 512 || if (!arg)
arg > PAGE_SIZE) arg = NBD_DEF_BLKSIZE;
if (!nbd_is_valid_blksize(arg))
return -EINVAL; return -EINVAL;
nbd_size_set(nbd, arg, nbd_size_set(nbd, arg,
div_s64(config->bytesize, arg)); div_s64(config->bytesize, arg));
@ -1332,7 +1343,7 @@ static struct nbd_config *nbd_alloc_config(void)
atomic_set(&config->recv_threads, 0); atomic_set(&config->recv_threads, 0);
init_waitqueue_head(&config->recv_wq); init_waitqueue_head(&config->recv_wq);
init_waitqueue_head(&config->conn_wait); init_waitqueue_head(&config->conn_wait);
config->blksize = 1024; config->blksize = NBD_DEF_BLKSIZE;
atomic_set(&config->live_connections, 0); atomic_set(&config->live_connections, 0);
try_module_get(THIS_MODULE); try_module_get(THIS_MODULE);
return config; return config;
@ -1673,6 +1684,30 @@ nbd_device_policy[NBD_DEVICE_ATTR_MAX + 1] = {
[NBD_DEVICE_CONNECTED] = { .type = NLA_U8 }, [NBD_DEVICE_CONNECTED] = { .type = NLA_U8 },
}; };
static int nbd_genl_size_set(struct genl_info *info, struct nbd_device *nbd)
{
struct nbd_config *config = nbd->config;
u64 bsize = config->blksize;
u64 bytes = config->bytesize;
if (info->attrs[NBD_ATTR_SIZE_BYTES])
bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]);
if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]) {
bsize = nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]);
if (!bsize)
bsize = NBD_DEF_BLKSIZE;
if (!nbd_is_valid_blksize(bsize)) {
printk(KERN_ERR "Invalid block size %llu\n", bsize);
return -EINVAL;
}
}
if (bytes != config->bytesize || bsize != config->blksize)
nbd_size_set(nbd, bsize, div64_u64(bytes, bsize));
return 0;
}
static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
{ {
struct nbd_device *nbd = NULL; struct nbd_device *nbd = NULL;
@ -1760,16 +1795,10 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
refcount_set(&nbd->config_refs, 1); refcount_set(&nbd->config_refs, 1);
set_bit(NBD_BOUND, &config->runtime_flags); set_bit(NBD_BOUND, &config->runtime_flags);
if (info->attrs[NBD_ATTR_SIZE_BYTES]) { ret = nbd_genl_size_set(info, nbd);
u64 bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]); if (ret)
nbd_size_set(nbd, config->blksize, goto out;
div64_u64(bytes, config->blksize));
}
if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]) {
u64 bsize =
nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]);
nbd_size_set(nbd, bsize, div64_u64(config->bytesize, bsize));
}
if (info->attrs[NBD_ATTR_TIMEOUT]) { if (info->attrs[NBD_ATTR_TIMEOUT]) {
u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]); u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]);
nbd->tag_set.timeout = timeout * HZ; nbd->tag_set.timeout = timeout * HZ;
@ -1938,6 +1967,10 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
goto out; goto out;
} }
ret = nbd_genl_size_set(info, nbd);
if (ret)
goto out;
if (info->attrs[NBD_ATTR_TIMEOUT]) { if (info->attrs[NBD_ATTR_TIMEOUT]) {
u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]); u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]);
nbd->tag_set.timeout = timeout * HZ; nbd->tag_set.timeout = timeout * HZ;

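For illustration (values assumed, with PAGE_SIZE taken as 4096), the new helper accepts only power-of-two sizes between 512 bytes and one page:

	nbd_is_valid_blksize(512);	/* true */
	nbd_is_valid_blksize(4096);	/* true on 4K-page systems */
	nbd_is_valid_blksize(3000);	/* false: not a power of two */
	nbd_is_valid_blksize(8192);	/* false: larger than PAGE_SIZE */
	/* A zero size never reaches the helper: both the ioctl and netlink
	 * paths substitute NBD_DEF_BLKSIZE (1024) first, as shown above. */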

@@ -89,8 +89,7 @@ struct nullb {
 int null_zone_init(struct nullb_device *dev);
 void null_zone_exit(struct nullb_device *dev);
 int null_zone_report(struct gendisk *disk, sector_t sector,
-		     struct blk_zone *zones, unsigned int *nr_zones,
-		     gfp_t gfp_mask);
+		     struct blk_zone *zones, unsigned int *nr_zones);
 void null_zone_write(struct nullb_cmd *cmd, sector_t sector,
 		     unsigned int nr_sectors);
 void null_zone_reset(struct nullb_cmd *cmd, sector_t sector);
@@ -103,7 +102,7 @@ static inline int null_zone_init(struct nullb_device *dev)
 static inline void null_zone_exit(struct nullb_device *dev) {}
 static inline int null_zone_report(struct gendisk *disk, sector_t sector,
 				   struct blk_zone *zones,
-				   unsigned int *nr_zones, gfp_t gfp_mask)
+				   unsigned int *nr_zones)
 {
 	return -EOPNOTSUPP;
 }


@@ -67,8 +67,7 @@ void null_zone_exit(struct nullb_device *dev)
 }
 int null_zone_report(struct gendisk *disk, sector_t sector,
-		     struct blk_zone *zones, unsigned int *nr_zones,
-		     gfp_t gfp_mask)
+		     struct blk_zone *zones, unsigned int *nr_zones)
 {
 	struct nullb *nullb = disk->private_data;
 	struct nullb_device *dev = nullb->dev;


@@ -461,15 +461,14 @@ static int flakey_prepare_ioctl(struct dm_target *ti, struct block_device **bdev
 #ifdef CONFIG_BLK_DEV_ZONED
 static int flakey_report_zones(struct dm_target *ti, sector_t sector,
-			       struct blk_zone *zones, unsigned int *nr_zones,
-			       gfp_t gfp_mask)
+			       struct blk_zone *zones, unsigned int *nr_zones)
 {
 	struct flakey_c *fc = ti->private;
 	int ret;
 	/* Do report and remap it */
 	ret = blkdev_report_zones(fc->dev->bdev, flakey_map_sector(ti, sector),
-				  zones, nr_zones, gfp_mask);
+				  zones, nr_zones);
 	if (ret != 0)
 		return ret;


@@ -137,15 +137,14 @@ static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev
 #ifdef CONFIG_BLK_DEV_ZONED
 static int linear_report_zones(struct dm_target *ti, sector_t sector,
-			       struct blk_zone *zones, unsigned int *nr_zones,
-			       gfp_t gfp_mask)
+			       struct blk_zone *zones, unsigned int *nr_zones)
 {
 	struct linear_c *lc = (struct linear_c *) ti->private;
 	int ret;
 	/* Do report and remap it */
 	ret = blkdev_report_zones(lc->dev->bdev, linear_map_sector(ti, sector),
-				  zones, nr_zones, gfp_mask);
+				  zones, nr_zones);
 	if (ret != 0)
 		return ret;


@ -8,6 +8,7 @@
#include <linux/module.h> #include <linux/module.h>
#include <linux/crc32.h> #include <linux/crc32.h>
#include <linux/sched/mm.h>
#define DM_MSG_PREFIX "zoned metadata" #define DM_MSG_PREFIX "zoned metadata"
@ -1162,8 +1163,7 @@ static int dmz_init_zones(struct dmz_metadata *zmd)
while (sector < dev->capacity) { while (sector < dev->capacity) {
/* Get zone information */ /* Get zone information */
nr_blkz = DMZ_REPORT_NR_ZONES; nr_blkz = DMZ_REPORT_NR_ZONES;
ret = blkdev_report_zones(dev->bdev, sector, blkz, ret = blkdev_report_zones(dev->bdev, sector, blkz, &nr_blkz);
&nr_blkz, GFP_KERNEL);
if (ret) { if (ret) {
dmz_dev_err(dev, "Report zones failed %d", ret); dmz_dev_err(dev, "Report zones failed %d", ret);
goto out; goto out;
@ -1201,12 +1201,20 @@ static int dmz_init_zones(struct dmz_metadata *zmd)
static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
{ {
unsigned int nr_blkz = 1; unsigned int nr_blkz = 1;
unsigned int noio_flag;
struct blk_zone blkz; struct blk_zone blkz;
int ret; int ret;
/* Get zone information from disk */ /*
* Get zone information from disk. Since blkdev_report_zones() uses
* GFP_KERNEL by default for memory allocations, set the per-task
* PF_MEMALLOC_NOIO flag so that all allocations are done as if
* GFP_NOIO was specified.
*/
noio_flag = memalloc_noio_save();
ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone), ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone),
&blkz, &nr_blkz, GFP_NOIO); &blkz, &nr_blkz);
memalloc_noio_restore(noio_flag);
if (!nr_blkz) if (!nr_blkz)
ret = -EIO; ret = -EIO;
if (ret) { if (ret) {


@@ -441,8 +441,7 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 }
 static int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
-			       struct blk_zone *zones, unsigned int *nr_zones,
-			       gfp_t gfp_mask)
+			       struct blk_zone *zones, unsigned int *nr_zones)
 {
 #ifdef CONFIG_BLK_DEV_ZONED
 	struct mapped_device *md = disk->private_data;
@@ -480,8 +479,7 @@ static int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
 	 * So there is no need to loop here trying to fill the entire array
 	 * of zones.
 	 */
-	ret = tgt->type->report_zones(tgt, sector, zones,
-				      nr_zones, gfp_mask);
+	ret = tgt->type->report_zones(tgt, sector, zones, nr_zones);
 out:
 	dm_put_live_table(md, srcu_idx);


@ -11,6 +11,7 @@
#include <linux/hdreg.h> #include <linux/hdreg.h>
#include <linux/kernel.h> #include <linux/kernel.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/list_sort.h> #include <linux/list_sort.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/types.h> #include <linux/types.h>
@ -1626,6 +1627,7 @@ static void nvme_update_disk_info(struct gendisk *disk,
{ {
sector_t capacity = le64_to_cpu(id->nsze) << (ns->lba_shift - 9); sector_t capacity = le64_to_cpu(id->nsze) << (ns->lba_shift - 9);
unsigned short bs = 1 << ns->lba_shift; unsigned short bs = 1 << ns->lba_shift;
u32 atomic_bs, phys_bs, io_opt;
if (ns->lba_shift > PAGE_SHIFT) { if (ns->lba_shift > PAGE_SHIFT) {
/* unsupported block size, set capacity to 0 later */ /* unsupported block size, set capacity to 0 later */
@ -1634,9 +1636,37 @@ static void nvme_update_disk_info(struct gendisk *disk,
blk_mq_freeze_queue(disk->queue); blk_mq_freeze_queue(disk->queue);
blk_integrity_unregister(disk); blk_integrity_unregister(disk);
if (id->nabo == 0) {
/*
* Bit 1 indicates whether NAWUPF is defined for this namespace
* and whether it should be used instead of AWUPF. If NAWUPF ==
* 0 then AWUPF must be used instead.
*/
if (id->nsfeat & (1 << 1) && id->nawupf)
atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
else
atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
} else {
atomic_bs = bs;
}
phys_bs = bs;
io_opt = bs;
if (id->nsfeat & (1 << 4)) {
/* NPWG = Namespace Preferred Write Granularity */
phys_bs *= 1 + le16_to_cpu(id->npwg);
/* NOWS = Namespace Optimal Write Size */
io_opt *= 1 + le16_to_cpu(id->nows);
}
blk_queue_logical_block_size(disk->queue, bs); blk_queue_logical_block_size(disk->queue, bs);
blk_queue_physical_block_size(disk->queue, bs); /*
blk_queue_io_min(disk->queue, bs); * Linux filesystems assume writing a single physical block is
* an atomic operation. Hence limit the physical block size to the
* value of the Atomic Write Unit Power Fail parameter.
*/
blk_queue_physical_block_size(disk->queue, min(phys_bs, atomic_bs));
blk_queue_io_min(disk->queue, phys_bs);
blk_queue_io_opt(disk->queue, io_opt);
if (ns->ms && !ns->ext && if (ns->ms && !ns->ext &&
(ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
@ -2386,8 +2416,8 @@ static bool nvme_validate_cntlid(struct nvme_subsystem *subsys,
lockdep_assert_held(&nvme_subsystems_lock); lockdep_assert_held(&nvme_subsystems_lock);
list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) { list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) {
if (ctrl->state == NVME_CTRL_DELETING || if (tmp->state == NVME_CTRL_DELETING ||
ctrl->state == NVME_CTRL_DEAD) tmp->state == NVME_CTRL_DEAD)
continue; continue;
if (tmp->cntlid == ctrl->cntlid) { if (tmp->cntlid == ctrl->cntlid) {
@ -2433,6 +2463,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev)); memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev));
subsys->vendor_id = le16_to_cpu(id->vid); subsys->vendor_id = le16_to_cpu(id->vid);
subsys->cmic = id->cmic; subsys->cmic = id->cmic;
subsys->awupf = le16_to_cpu(id->awupf);
#ifdef CONFIG_NVME_MULTIPATH #ifdef CONFIG_NVME_MULTIPATH
subsys->iopolicy = NVME_IOPOLICY_NUMA; subsys->iopolicy = NVME_IOPOLICY_NUMA;
#endif #endif
@ -3274,6 +3305,10 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
goto out_free_ns; goto out_free_ns;
} }
if (ctrl->opts && ctrl->opts->data_digest)
ns->queue->backing_dev_info->capabilities
|= BDI_CAP_STABLE_WRITES;
blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue); blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
if (ctrl->ops->flags & NVME_F_PCI_P2PDMA) if (ctrl->ops->flags & NVME_F_PCI_P2PDMA)
blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue); blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue);

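A worked example of the sizing logic above, with all values assumed for illustration rather than taken from any particular device:

	/*
	 * With a 512-byte LBA format, NPWG = 15 (0's based, so 16 blocks) and
	 * NAWUPF = 7 (0's based, so 8 blocks):
	 *   phys_bs   = 16 * 512 = 8192
	 *   atomic_bs =  8 * 512 = 4096
	 * The exported physical block size is min(8192, 4096) = 4096, keeping
	 * it within what the device can write atomically across a power
	 * failure, while io_min/io_opt still advertise the larger preferred
	 * write granularity.
	 */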

@ -204,6 +204,9 @@ static DEFINE_IDA(nvme_fc_ctrl_cnt);
static struct workqueue_struct *nvme_fc_wq; static struct workqueue_struct *nvme_fc_wq;
static bool nvme_fc_waiting_to_unload;
static DECLARE_COMPLETION(nvme_fc_unload_proceed);
/* /*
* These items are short-term. They will eventually be moved into * These items are short-term. They will eventually be moved into
* a generic FC class. See comments in module init. * a generic FC class. See comments in module init.
@ -229,6 +232,8 @@ nvme_fc_free_lport(struct kref *ref)
/* remove from transport list */ /* remove from transport list */
spin_lock_irqsave(&nvme_fc_lock, flags); spin_lock_irqsave(&nvme_fc_lock, flags);
list_del(&lport->port_list); list_del(&lport->port_list);
if (nvme_fc_waiting_to_unload && list_empty(&nvme_fc_lport_list))
complete(&nvme_fc_unload_proceed);
spin_unlock_irqrestore(&nvme_fc_lock, flags); spin_unlock_irqrestore(&nvme_fc_lock, flags);
ida_simple_remove(&nvme_fc_local_port_cnt, lport->localport.port_num); ida_simple_remove(&nvme_fc_local_port_cnt, lport->localport.port_num);
@ -3457,11 +3462,51 @@ static int __init nvme_fc_init_module(void)
return ret; return ret;
} }
static void
nvme_fc_delete_controllers(struct nvme_fc_rport *rport)
{
struct nvme_fc_ctrl *ctrl;
spin_lock(&rport->lock);
list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) {
dev_warn(ctrl->ctrl.device,
"NVME-FC{%d}: transport unloading: deleting ctrl\n",
ctrl->cnum);
nvme_delete_ctrl(&ctrl->ctrl);
}
spin_unlock(&rport->lock);
}
static void
nvme_fc_cleanup_for_unload(void)
{
struct nvme_fc_lport *lport;
struct nvme_fc_rport *rport;
list_for_each_entry(lport, &nvme_fc_lport_list, port_list) {
list_for_each_entry(rport, &lport->endp_list, endp_list) {
nvme_fc_delete_controllers(rport);
}
}
}
static void __exit nvme_fc_exit_module(void) static void __exit nvme_fc_exit_module(void)
{ {
/* sanity check - all lports should be removed */ unsigned long flags;
if (!list_empty(&nvme_fc_lport_list)) bool need_cleanup = false;
pr_warn("%s: localport list not empty\n", __func__);
spin_lock_irqsave(&nvme_fc_lock, flags);
nvme_fc_waiting_to_unload = true;
if (!list_empty(&nvme_fc_lport_list)) {
need_cleanup = true;
nvme_fc_cleanup_for_unload();
}
spin_unlock_irqrestore(&nvme_fc_lock, flags);
if (need_cleanup) {
pr_info("%s: waiting for ctlr deletes\n", __func__);
wait_for_completion(&nvme_fc_unload_proceed);
pr_info("%s: ctrl deletes complete\n", __func__);
}
nvmf_unregister_transport(&nvme_fc_transport); nvmf_unregister_transport(&nvme_fc_transport);


@@ -123,14 +123,20 @@ void nvme_mpath_clear_current_path(struct nvme_ns *ns)
 	}
 }
+static bool nvme_path_is_disabled(struct nvme_ns *ns)
+{
+	return ns->ctrl->state != NVME_CTRL_LIVE ||
+		test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
+		test_bit(NVME_NS_REMOVING, &ns->flags);
+}
 static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
 {
 	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
 	struct nvme_ns *found = NULL, *fallback = NULL, *ns;
 	list_for_each_entry_rcu(ns, &head->list, siblings) {
-		if (ns->ctrl->state != NVME_CTRL_LIVE ||
-		    test_bit(NVME_NS_ANA_PENDING, &ns->flags))
+		if (nvme_path_is_disabled(ns))
 			continue;
 		if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
@@ -178,14 +184,16 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
 {
 	struct nvme_ns *ns, *found, *fallback = NULL;
-	if (list_is_singular(&head->list))
+	if (list_is_singular(&head->list)) {
+		if (nvme_path_is_disabled(old))
+			return NULL;
 		return old;
+	}
 	for (ns = nvme_next_ns(head, old);
 	     ns != old;
 	     ns = nvme_next_ns(head, ns)) {
-		if (ns->ctrl->state != NVME_CTRL_LIVE ||
-		    test_bit(NVME_NS_ANA_PENDING, &ns->flags))
+		if (nvme_path_is_disabled(ns))
 			continue;
 		if (ns->ana_state == NVME_ANA_OPTIMIZED) {


@@ -283,6 +283,7 @@ struct nvme_subsystem {
 	char firmware_rev[8];
 	u8 cmic;
 	u16 vendor_id;
+	u16 awupf;	/* 0's based awupf value. */
 	struct ida ns_ida;
 #ifdef CONFIG_NVME_MULTIPATH
 	enum nvme_iopolicy iopolicy;


@ -1439,11 +1439,15 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) { if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth)); nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth));
nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev, if (nvmeq->sq_cmds) {
nvmeq->sq_cmds); nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev,
if (nvmeq->sq_dma_addr) { nvmeq->sq_cmds);
set_bit(NVMEQ_SQ_CMB, &nvmeq->flags); if (nvmeq->sq_dma_addr) {
return 0; set_bit(NVMEQ_SQ_CMB, &nvmeq->flags);
return 0;
}
pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(depth));
} }
} }
@ -2250,7 +2254,9 @@ static int nvme_dev_add(struct nvme_dev *dev)
if (!dev->ctrl.tagset) { if (!dev->ctrl.tagset) {
dev->tagset.ops = &nvme_mq_ops; dev->tagset.ops = &nvme_mq_ops;
dev->tagset.nr_hw_queues = dev->online_queues - 1; dev->tagset.nr_hw_queues = dev->online_queues - 1;
dev->tagset.nr_maps = 2; /* default + read */ dev->tagset.nr_maps = 1; /* default */
if (dev->io_queues[HCTX_TYPE_READ])
dev->tagset.nr_maps++;
if (dev->io_queues[HCTX_TYPE_POLL]) if (dev->io_queues[HCTX_TYPE_POLL])
dev->tagset.nr_maps++; dev->tagset.nr_maps++;
dev->tagset.timeout = NVME_IO_TIMEOUT; dev->tagset.timeout = NVME_IO_TIMEOUT;
@ -2289,8 +2295,7 @@ static int nvme_pci_enable(struct nvme_dev *dev)
pci_set_master(pdev); pci_set_master(pdev);
if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) && if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)))
dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32)))
goto disable; goto disable;
if (readl(dev->bar + NVME_REG_CSTS) == -1) { if (readl(dev->bar + NVME_REG_CSTS) == -1) {
@ -2498,7 +2503,8 @@ static void nvme_reset_work(struct work_struct *work)
* Limit the max command size to prevent iod->sg allocations going * Limit the max command size to prevent iod->sg allocations going
* over a single page. * over a single page.
*/ */
dev->ctrl.max_hw_sectors = NVME_MAX_KB_SZ << 1; dev->ctrl.max_hw_sectors = min_t(u32,
NVME_MAX_KB_SZ << 1, dma_max_mapping_size(dev->dev) >> 9);
dev->ctrl.max_segments = NVME_MAX_SEGS; dev->ctrl.max_segments = NVME_MAX_SEGS;
/* /*
@ -2923,7 +2929,7 @@ static int nvme_simple_resume(struct device *dev)
return 0; return 0;
} }
const struct dev_pm_ops nvme_dev_pm_ops = { static const struct dev_pm_ops nvme_dev_pm_ops = {
.suspend = nvme_suspend, .suspend = nvme_suspend,
.resume = nvme_resume, .resume = nvme_resume,
.freeze = nvme_simple_suspend, .freeze = nvme_simple_suspend,


@@ -860,7 +860,14 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
 		else
 			flags |= MSG_MORE;
-		ret = kernel_sendpage(queue->sock, page, offset, len, flags);
+		/* can't zcopy slab pages */
+		if (unlikely(PageSlab(page))) {
+			ret = sock_no_sendpage(queue->sock, page, offset, len,
+					flags);
+		} else {
+			ret = kernel_sendpage(queue->sock, page, offset, len,
+					flags);
+		}
 		if (ret <= 0)
 			return ret;


@@ -7,6 +7,17 @@
 #include <asm/unaligned.h>
 #include "trace.h"
+static const char *nvme_trace_delete_sq(struct trace_seq *p, u8 *cdw10)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+	u16 sqid = get_unaligned_le16(cdw10);
+	trace_seq_printf(p, "sqid=%u", sqid);
+	trace_seq_putc(p, 0);
+	return ret;
+}
 static const char *nvme_trace_create_sq(struct trace_seq *p, u8 *cdw10)
 {
 	const char *ret = trace_seq_buffer_ptr(p);
@@ -23,6 +34,17 @@ static const char *nvme_trace_create_sq(struct trace_seq *p, u8 *cdw10)
 	return ret;
 }
+static const char *nvme_trace_delete_cq(struct trace_seq *p, u8 *cdw10)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+	u16 cqid = get_unaligned_le16(cdw10);
+	trace_seq_printf(p, "cqid=%u", cqid);
+	trace_seq_putc(p, 0);
+	return ret;
+}
 static const char *nvme_trace_create_cq(struct trace_seq *p, u8 *cdw10)
 {
 	const char *ret = trace_seq_buffer_ptr(p);
@@ -107,8 +129,12 @@ const char *nvme_trace_parse_admin_cmd(struct trace_seq *p,
 				       u8 opcode, u8 *cdw10)
 {
 	switch (opcode) {
+	case nvme_admin_delete_sq:
+		return nvme_trace_delete_sq(p, cdw10);
 	case nvme_admin_create_sq:
 		return nvme_trace_create_sq(p, cdw10);
+	case nvme_admin_delete_cq:
+		return nvme_trace_delete_cq(p, cdw10);
 	case nvme_admin_create_cq:
 		return nvme_trace_create_cq(p, cdw10);
 	case nvme_admin_identify:
@@ -178,7 +204,7 @@ static const char *nvme_trace_fabrics_common(struct trace_seq *p, u8 *spc)
 {
 	const char *ret = trace_seq_buffer_ptr(p);
-	trace_seq_printf(p, "spcecific=%*ph", 24, spc);
+	trace_seq_printf(p, "specific=%*ph", 24, spc);
 	trace_seq_putc(p, 0);
 	return ret;
 }


@@ -442,6 +442,9 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
 		break;
 	}
+	if (ns->bdev)
+		nvmet_bdev_set_limits(ns->bdev, id);
 	/*
 	 * We just provide a single LBA format that matches what the
 	 * underlying device reports.


@@ -588,8 +588,10 @@ static struct config_group *nvmet_ns_make(struct config_group *group,
 		goto out;
 	ret = -EINVAL;
-	if (nsid == 0 || nsid == NVME_NSID_ALL)
+	if (nsid == 0 || nsid == NVME_NSID_ALL) {
+		pr_err("invalid nsid %#x", nsid);
 		goto out;
+	}
 	ret = -ENOMEM;
 	ns = nvmet_ns_alloc(subsys, nsid);


@ -434,7 +434,7 @@ fcloop_fcp_recv_work(struct work_struct *work)
int ret = 0; int ret = 0;
bool aborted = false; bool aborted = false;
spin_lock(&tfcp_req->reqlock); spin_lock_irq(&tfcp_req->reqlock);
switch (tfcp_req->inistate) { switch (tfcp_req->inistate) {
case INI_IO_START: case INI_IO_START:
tfcp_req->inistate = INI_IO_ACTIVE; tfcp_req->inistate = INI_IO_ACTIVE;
@ -443,11 +443,11 @@ fcloop_fcp_recv_work(struct work_struct *work)
aborted = true; aborted = true;
break; break;
default: default:
spin_unlock(&tfcp_req->reqlock); spin_unlock_irq(&tfcp_req->reqlock);
WARN_ON(1); WARN_ON(1);
return; return;
} }
spin_unlock(&tfcp_req->reqlock); spin_unlock_irq(&tfcp_req->reqlock);
if (unlikely(aborted)) if (unlikely(aborted))
ret = -ECANCELED; ret = -ECANCELED;
@ -469,7 +469,7 @@ fcloop_fcp_abort_recv_work(struct work_struct *work)
struct nvmefc_fcp_req *fcpreq; struct nvmefc_fcp_req *fcpreq;
bool completed = false; bool completed = false;
spin_lock(&tfcp_req->reqlock); spin_lock_irq(&tfcp_req->reqlock);
fcpreq = tfcp_req->fcpreq; fcpreq = tfcp_req->fcpreq;
switch (tfcp_req->inistate) { switch (tfcp_req->inistate) {
case INI_IO_ABORTED: case INI_IO_ABORTED:
@ -478,11 +478,11 @@ fcloop_fcp_abort_recv_work(struct work_struct *work)
completed = true; completed = true;
break; break;
default: default:
spin_unlock(&tfcp_req->reqlock); spin_unlock_irq(&tfcp_req->reqlock);
WARN_ON(1); WARN_ON(1);
return; return;
} }
spin_unlock(&tfcp_req->reqlock); spin_unlock_irq(&tfcp_req->reqlock);
if (unlikely(completed)) { if (unlikely(completed)) {
/* remove reference taken in original abort downcall */ /* remove reference taken in original abort downcall */
@ -494,9 +494,9 @@ fcloop_fcp_abort_recv_work(struct work_struct *work)
nvmet_fc_rcv_fcp_abort(tfcp_req->tport->targetport, nvmet_fc_rcv_fcp_abort(tfcp_req->tport->targetport,
&tfcp_req->tgt_fcp_req); &tfcp_req->tgt_fcp_req);
spin_lock(&tfcp_req->reqlock); spin_lock_irq(&tfcp_req->reqlock);
tfcp_req->fcpreq = NULL; tfcp_req->fcpreq = NULL;
spin_unlock(&tfcp_req->reqlock); spin_unlock_irq(&tfcp_req->reqlock);
fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED); fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED);
/* call_host_done releases reference for abort downcall */ /* call_host_done releases reference for abort downcall */
@ -513,10 +513,10 @@ fcloop_tgt_fcprqst_done_work(struct work_struct *work)
container_of(work, struct fcloop_fcpreq, tio_done_work); container_of(work, struct fcloop_fcpreq, tio_done_work);
struct nvmefc_fcp_req *fcpreq; struct nvmefc_fcp_req *fcpreq;
spin_lock(&tfcp_req->reqlock); spin_lock_irq(&tfcp_req->reqlock);
fcpreq = tfcp_req->fcpreq; fcpreq = tfcp_req->fcpreq;
tfcp_req->inistate = INI_IO_COMPLETED; tfcp_req->inistate = INI_IO_COMPLETED;
spin_unlock(&tfcp_req->reqlock); spin_unlock_irq(&tfcp_req->reqlock);
fcloop_call_host_done(fcpreq, tfcp_req, tfcp_req->status); fcloop_call_host_done(fcpreq, tfcp_req, tfcp_req->status);
} }
@ -535,7 +535,7 @@ fcloop_fcp_req(struct nvme_fc_local_port *localport,
if (!rport->targetport) if (!rport->targetport)
return -ECONNREFUSED; return -ECONNREFUSED;
tfcp_req = kzalloc(sizeof(*tfcp_req), GFP_KERNEL); tfcp_req = kzalloc(sizeof(*tfcp_req), GFP_ATOMIC);
if (!tfcp_req) if (!tfcp_req)
return -ENOMEM; return -ENOMEM;
@ -621,12 +621,12 @@ fcloop_fcp_op(struct nvmet_fc_target_port *tgtport,
int fcp_err = 0, active, aborted; int fcp_err = 0, active, aborted;
u8 op = tgt_fcpreq->op; u8 op = tgt_fcpreq->op;
spin_lock(&tfcp_req->reqlock); spin_lock_irq(&tfcp_req->reqlock);
fcpreq = tfcp_req->fcpreq; fcpreq = tfcp_req->fcpreq;
active = tfcp_req->active; active = tfcp_req->active;
aborted = tfcp_req->aborted; aborted = tfcp_req->aborted;
tfcp_req->active = true; tfcp_req->active = true;
spin_unlock(&tfcp_req->reqlock); spin_unlock_irq(&tfcp_req->reqlock);
if (unlikely(active)) if (unlikely(active))
/* illegal - call while i/o active */ /* illegal - call while i/o active */
@ -634,9 +634,9 @@ fcloop_fcp_op(struct nvmet_fc_target_port *tgtport,
if (unlikely(aborted)) { if (unlikely(aborted)) {
/* target transport has aborted i/o prior */ /* target transport has aborted i/o prior */
spin_lock(&tfcp_req->reqlock); spin_lock_irq(&tfcp_req->reqlock);
tfcp_req->active = false; tfcp_req->active = false;
spin_unlock(&tfcp_req->reqlock); spin_unlock_irq(&tfcp_req->reqlock);
tgt_fcpreq->transferred_length = 0; tgt_fcpreq->transferred_length = 0;
tgt_fcpreq->fcp_error = -ECANCELED; tgt_fcpreq->fcp_error = -ECANCELED;
tgt_fcpreq->done(tgt_fcpreq); tgt_fcpreq->done(tgt_fcpreq);
@ -693,9 +693,9 @@ fcloop_fcp_op(struct nvmet_fc_target_port *tgtport,
break; break;
} }
spin_lock(&tfcp_req->reqlock); spin_lock_irq(&tfcp_req->reqlock);
tfcp_req->active = false; tfcp_req->active = false;
spin_unlock(&tfcp_req->reqlock); spin_unlock_irq(&tfcp_req->reqlock);
tgt_fcpreq->transferred_length = xfrlen; tgt_fcpreq->transferred_length = xfrlen;
tgt_fcpreq->fcp_error = fcp_err; tgt_fcpreq->fcp_error = fcp_err;
@ -715,9 +715,9 @@ fcloop_tgt_fcp_abort(struct nvmet_fc_target_port *tgtport,
* (one doing io, other doing abort) and only kills ops posted * (one doing io, other doing abort) and only kills ops posted
* after the abort request * after the abort request
*/ */
spin_lock(&tfcp_req->reqlock); spin_lock_irq(&tfcp_req->reqlock);
tfcp_req->aborted = true; tfcp_req->aborted = true;
spin_unlock(&tfcp_req->reqlock); spin_unlock_irq(&tfcp_req->reqlock);
tfcp_req->status = NVME_SC_INTERNAL; tfcp_req->status = NVME_SC_INTERNAL;
@ -765,7 +765,7 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport,
return; return;
/* break initiator/target relationship for io */ /* break initiator/target relationship for io */
spin_lock(&tfcp_req->reqlock); spin_lock_irq(&tfcp_req->reqlock);
switch (tfcp_req->inistate) { switch (tfcp_req->inistate) {
case INI_IO_START: case INI_IO_START:
case INI_IO_ACTIVE: case INI_IO_ACTIVE:
@ -775,11 +775,11 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport,
abortio = false; abortio = false;
break; break;
default: default:
spin_unlock(&tfcp_req->reqlock); spin_unlock_irq(&tfcp_req->reqlock);
WARN_ON(1); WARN_ON(1);
return; return;
} }
spin_unlock(&tfcp_req->reqlock); spin_unlock_irq(&tfcp_req->reqlock);
if (abortio) if (abortio)
/* leave the reference while the work item is scheduled */ /* leave the reference while the work item is scheduled */


@@ -8,6 +8,45 @@
 #include <linux/module.h>
 #include "nvmet.h"
+void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id)
+{
+const struct queue_limits *ql = &bdev_get_queue(bdev)->limits;
+/* Number of physical blocks per logical block. */
+const u32 ppl = ql->physical_block_size / ql->logical_block_size;
+/* Physical blocks per logical block, 0's based. */
+const __le16 ppl0b = to0based(ppl);
+/*
+* For NVMe 1.2 and later, bit 1 indicates that the fields NAWUN,
+* NAWUPF, and NACWU are defined for this namespace and should be
+* used by the host for this namespace instead of the AWUN, AWUPF,
+* and ACWU fields in the Identify Controller data structure. If
+* any of these fields are zero that means that the corresponding
+* field from the identify controller data structure should be used.
+*/
+id->nsfeat |= 1 << 1;
+id->nawun = ppl0b;
+id->nawupf = ppl0b;
+id->nacwu = ppl0b;
+/*
+* Bit 4 indicates that the fields NPWG, NPWA, NPDG, NPDA, and
+* NOWS are defined for this namespace and should be used by
+* the host for I/O optimization.
+*/
+id->nsfeat |= 1 << 4;
+/* NPWG = Namespace Preferred Write Granularity. 0's based */
+id->npwg = ppl0b;
+/* NPWA = Namespace Preferred Write Alignment. 0's based */
+id->npwa = id->npwg;
+/* NPDG = Namespace Preferred Deallocate Granularity. 0's based */
+id->npdg = to0based(ql->discard_granularity / ql->logical_block_size);
+/* NPDG = Namespace Preferred Deallocate Alignment */
+id->npda = id->npdg;
+/* NOWS = Namespace Optimal Write Size */
+id->nows = to0based(ql->io_opt / ql->logical_block_size);
+}
 int nvmet_bdev_ns_enable(struct nvmet_ns *ns)
 {
 int ret;

@@ -365,6 +365,7 @@ u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask);
 void nvmet_execute_async_event(struct nvmet_req *req);
 u16 nvmet_parse_connect_cmd(struct nvmet_req *req);
+void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id);
 u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req);
 u16 nvmet_file_parse_io_cmd(struct nvmet_req *req);
 u16 nvmet_parse_admin_cmd(struct nvmet_req *req);
@@ -492,4 +493,11 @@ static inline u32 nvmet_rw_len(struct nvmet_req *req)
 }
 u16 errno_to_nvme_status(struct nvmet_req *req, int errno);
+/* Convert a 32-bit number to a 16-bit 0's based number */
+static inline __le16 to0based(u32 a)
+{
+return cpu_to_le16(max(1U, min(1U << 16, a)) - 1);
+}
 #endif /* _NVMET_H */
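For reference, to0based() clamps its input to the range 1..65536 before subtracting one, so a zero and anything wider than 16 bits both remain representable in a 16-bit 0's based field. A standalone sketch of the same arithmetic (userspace C; the cpu_to_le16() byte-swap is omitted and the sample inputs are arbitrary):

    #include <stdio.h>
    #include <stdint.h>

    /* Mirrors the clamp-then-subtract logic of to0based(); result fits 16 bits. */
    static unsigned int to0based_demo(uint32_t a)
    {
            if (a < 1)
                    a = 1;
            if (a > (1U << 16))
                    a = 1U << 16;
            return a - 1;
    }

    int main(void)
    {
            /* 0 -> 0, 1 -> 0, 8 -> 7, 100000 -> 65535 */
            printf("%u %u %u %u\n", to0based_demo(0), to0based_demo(1),
                   to0based_demo(8), to0based_demo(100000));
            return 0;
    }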

@@ -146,7 +146,7 @@ static const char *nvmet_trace_fabrics_common(struct trace_seq *p, u8 *spc)
 {
 const char *ret = trace_seq_buffer_ptr(p);
-trace_seq_printf(p, "spcecific=%*ph", 24, spc);
+trace_seq_printf(p, "specific=%*ph", 24, spc);
 trace_seq_putc(p, 0);
 return ret;
 }

@@ -213,8 +213,7 @@ extern blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd);
 extern void sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes,
 struct scsi_sense_hdr *sshdr);
 extern int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
-struct blk_zone *zones, unsigned int *nr_zones,
-gfp_t gfp_mask);
+struct blk_zone *zones, unsigned int *nr_zones);
 #else /* CONFIG_BLK_DEV_ZONED */

@@ -9,6 +9,8 @@
 */
 #include <linux/blkdev.h>
+#include <linux/vmalloc.h>
+#include <linux/sched/mm.h>
 #include <asm/unaligned.h>
@@ -50,7 +52,7 @@ static void sd_zbc_parse_report(struct scsi_disk *sdkp, u8 *buf,
 /**
 * sd_zbc_do_report_zones - Issue a REPORT ZONES scsi command.
 * @sdkp: The target disk
-* @buf: Buffer to use for the reply
+* @buf: vmalloc-ed buffer to use for the reply
 * @buflen: the buffer size
 * @lba: Start LBA of the report
 * @partial: Do partial report
@@ -79,7 +81,6 @@ static int sd_zbc_do_report_zones(struct scsi_disk *sdkp, unsigned char *buf,
 put_unaligned_be32(buflen, &cmd[10]);
 if (partial)
 cmd[14] = ZBC_REPORT_ZONE_PARTIAL;
-memset(buf, 0, buflen);
 result = scsi_execute_req(sdp, cmd, DMA_FROM_DEVICE,
 buf, buflen, &sshdr,
@@ -103,45 +104,83 @@ static int sd_zbc_do_report_zones(struct scsi_disk *sdkp, unsigned char *buf,
 return 0;
 }
+/*
+* Maximum number of zones to get with one report zones command.
+*/
+#define SD_ZBC_REPORT_MAX_ZONES 8192U
+/**
+* Allocate a buffer for report zones reply.
+* @sdkp: The target disk
+* @nr_zones: Maximum number of zones to report
+* @buflen: Size of the buffer allocated
+*
+* Try to allocate a reply buffer for the number of requested zones.
+* The size of the buffer allocated may be smaller than requested to
+* satify the device constraint (max_hw_sectors, max_segments, etc).
+*
+* Return the address of the allocated buffer and update @buflen with
+* the size of the allocated buffer.
+*/
+static void *sd_zbc_alloc_report_buffer(struct scsi_disk *sdkp,
+unsigned int nr_zones, size_t *buflen)
+{
+struct request_queue *q = sdkp->disk->queue;
+size_t bufsize;
+void *buf;
+/*
+* Report zone buffer size should be at most 64B times the number of
+* zones requested plus the 64B reply header, but should be at least
+* SECTOR_SIZE for ATA devices.
+* Make sure that this size does not exceed the hardware capabilities.
+* Furthermore, since the report zone command cannot be split, make
+* sure that the allocated buffer can always be mapped by limiting the
+* number of pages allocated to the HBA max segments limit.
+*/
+nr_zones = min(nr_zones, SD_ZBC_REPORT_MAX_ZONES);
+bufsize = roundup((nr_zones + 1) * 64, 512);
+bufsize = min_t(size_t, bufsize,
+queue_max_hw_sectors(q) << SECTOR_SHIFT);
+bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT);
+buf = vzalloc(bufsize);
+if (buf)
+*buflen = bufsize;
+return buf;
+}
 /**
 * sd_zbc_report_zones - Disk report zones operation.
 * @disk: The target disk
 * @sector: Start 512B sector of the report
 * @zones: Array of zone descriptors
 * @nr_zones: Number of descriptors in the array
-* @gfp_mask: Memory allocation mask
 *
 * Execute a report zones command on the target disk.
 */
 int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
-struct blk_zone *zones, unsigned int *nr_zones,
-gfp_t gfp_mask)
+struct blk_zone *zones, unsigned int *nr_zones)
 {
 struct scsi_disk *sdkp = scsi_disk(disk);
-unsigned int i, buflen, nrz = *nr_zones;
+unsigned int i, nrz = *nr_zones;
 unsigned char *buf;
-size_t offset = 0;
+size_t buflen = 0, offset = 0;
 int ret = 0;
 if (!sd_is_zoned(sdkp))
 /* Not a zoned device */
 return -EOPNOTSUPP;
-/*
-* Get a reply buffer for the number of requested zones plus a header,
-* without exceeding the device maximum command size. For ATA disks,
-* buffers must be aligned to 512B.
-*/
-buflen = min(queue_max_hw_sectors(disk->queue) << 9,
-roundup((nrz + 1) * 64, 512));
-buf = kmalloc(buflen, gfp_mask);
+buf = sd_zbc_alloc_report_buffer(sdkp, nrz, &buflen);
 if (!buf)
 return -ENOMEM;
 ret = sd_zbc_do_report_zones(sdkp, buf, buflen,
 sectors_to_logical(sdkp->device, sector), true);
 if (ret)
-goto out_free_buf;
+goto out;
 nrz = min(nrz, get_unaligned_be32(&buf[0]) / 64);
 for (i = 0; i < nrz; i++) {
@@ -152,8 +191,8 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
 *nr_zones = nrz;
-out_free_buf:
-kfree(buf);
+out:
+kvfree(buf);
 return ret;
 }
@@ -287,8 +326,6 @@ static int sd_zbc_check_zoned_characteristics(struct scsi_disk *sdkp,
 return 0;
 }
-#define SD_ZBC_BUF_SIZE 131072U
 /**
 * sd_zbc_check_zones - Check the device capacity and zone sizes
 * @sdkp: Target disk
@@ -304,22 +341,28 @@ static int sd_zbc_check_zoned_characteristics(struct scsi_disk *sdkp,
 */
 static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks)
 {
+size_t bufsize, buflen;
+unsigned int noio_flag;
 u64 zone_blocks = 0;
 sector_t max_lba, block = 0;
 unsigned char *buf;
 unsigned char *rec;
-unsigned int buf_len;
-unsigned int list_length;
 int ret;
 u8 same;
+/* Do all memory allocations as if GFP_NOIO was specified */
+noio_flag = memalloc_noio_save();
 /* Get a buffer */
-buf = kmalloc(SD_ZBC_BUF_SIZE, GFP_KERNEL);
-if (!buf)
-return -ENOMEM;
+buf = sd_zbc_alloc_report_buffer(sdkp, SD_ZBC_REPORT_MAX_ZONES,
+&bufsize);
+if (!buf) {
+ret = -ENOMEM;
+goto out;
+}
 /* Do a report zone to get max_lba and the same field */
-ret = sd_zbc_do_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE, 0, false);
+ret = sd_zbc_do_report_zones(sdkp, buf, bufsize, 0, false);
 if (ret)
 goto out_free;
@@ -355,12 +398,12 @@ static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks)
 do {
 /* Parse REPORT ZONES header */
-list_length = get_unaligned_be32(&buf[0]) + 64;
+buflen = min_t(size_t, get_unaligned_be32(&buf[0]) + 64,
+bufsize);
 rec = buf + 64;
-buf_len = min(list_length, SD_ZBC_BUF_SIZE);
 /* Parse zone descriptors */
-while (rec < buf + buf_len) {
+while (rec < buf + buflen) {
 u64 this_zone_blocks = get_unaligned_be64(&rec[8]);
 if (zone_blocks == 0) {
@@ -376,8 +419,8 @@ static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks)
 }
 if (block < sdkp->capacity) {
-ret = sd_zbc_do_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE,
-block, true);
+ret = sd_zbc_do_report_zones(sdkp, buf, bufsize, block,
+true);
 if (ret)
 goto out_free;
 }
@@ -408,7 +451,8 @@ static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks)
 }
 out_free:
-kfree(buf);
+memalloc_noio_restore(noio_flag);
+kvfree(buf);
 return ret;
 }
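The new sd_zbc_alloc_report_buffer() caps the reply buffer three ways: at most SD_ZBC_REPORT_MAX_ZONES descriptors of 64 B plus a 64 B header rounded up to 512 B, at most max_hw_sectors worth of data, and at most max_segments pages so the unsplittable REPORT ZONES command can always be mapped. A standalone sketch of that sizing arithmetic with invented queue limits (the numbers are illustrative, not from any real HBA):

    #include <stdio.h>
    #include <stddef.h>

    #define SECTOR_SHIFT 9
    #define PAGE_SHIFT 12
    #define SD_ZBC_REPORT_MAX_ZONES 8192U

    static size_t roundup_to(size_t x, size_t align)
    {
            return (x + align - 1) / align * align;
    }

    int main(void)
    {
            /* Hypothetical limits: 1024 sectors per command, 128 segments. */
            size_t max_hw_sectors = 1024, max_segments = 128;
            unsigned int nr_zones = 100000; /* caller asks for far too many */

            if (nr_zones > SD_ZBC_REPORT_MAX_ZONES)
                    nr_zones = SD_ZBC_REPORT_MAX_ZONES;

            size_t bufsize = roundup_to(((size_t)nr_zones + 1) * 64, 512);
            if (bufsize > max_hw_sectors << SECTOR_SHIFT)
                    bufsize = max_hw_sectors << SECTOR_SHIFT;
            if (bufsize > max_segments << PAGE_SHIFT)
                    bufsize = max_segments << PAGE_SHIFT;

            printf("report buffer: %zu bytes (%zu zone descriptors)\n",
                   bufsize, bufsize / 64 - 1);
            return 0;
    }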

@@ -2911,7 +2911,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
 bio = NULL;
 } else {
 if (wbc)
-wbc_account_io(wbc, page, page_size);
+wbc_account_cgroup_owner(wbc, page, page_size);
 return 0;
 }
 }
@@ -2924,7 +2924,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
 bio->bi_opf = opf;
 if (wbc) {
 wbc_init_bio(wbc, bio);
-wbc_account_io(wbc, page, page_size);
+wbc_account_cgroup_owner(wbc, page, page_size);
 }
 *bio_ret = bio;

@@ -3089,7 +3089,7 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
 if (wbc) {
 wbc_init_bio(wbc, bio);
-wbc_account_io(wbc, bh->b_page, bh->b_size);
+wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size);
 }
 submit_bio(bio);

@@ -396,7 +396,7 @@ static int io_submit_add_bh(struct ext4_io_submit *io,
 ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
 if (ret != bh->b_size)
 goto submit_and_retry;
-wbc_account_io(io->io_wbc, page, bh->b_size);
+wbc_account_cgroup_owner(io->io_wbc, page, bh->b_size);
 io->io_next_block++;
 return 0;
 }

@@ -470,7 +470,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
 }
 if (fio->io_wbc && !is_read_io(fio->op))
-wbc_account_io(fio->io_wbc, page, PAGE_SIZE);
+wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE);
 bio_set_op_attrs(bio, fio->op, fio->op_flags);
@@ -513,7 +513,7 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio)
 }
 if (fio->io_wbc)
-wbc_account_io(fio->io_wbc, page, PAGE_SIZE);
+wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE);
 inc_page_count(fio->sbi, WB_DATA_TYPE(page));
@@ -592,7 +592,7 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio)
 }
 if (fio->io_wbc)
-wbc_account_io(fio->io_wbc, bio_page, PAGE_SIZE);
+wbc_account_cgroup_owner(fio->io_wbc, bio_page, PAGE_SIZE);
 io->last_block_in_bio = fio->new_blkaddr;
 f2fs_trace_ios(fio, 0);

@@ -2818,9 +2818,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
 while (zones && sector < nr_sectors) {
 nr_zones = F2FS_REPORT_NR_ZONES;
-err = blkdev_report_zones(bdev, sector,
-zones, &nr_zones,
-GFP_KERNEL);
+err = blkdev_report_zones(bdev, sector, zones, &nr_zones);
 if (err)
 break;
 if (!nr_zones) {

@@ -270,6 +270,7 @@ void __inode_attach_wb(struct inode *inode, struct page *page)
 if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
 wb_put(wb);
 }
+EXPORT_SYMBOL_GPL(__inode_attach_wb);
 /**
 * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
@@ -582,6 +583,7 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
 if (unlikely(wb_dying(wbc->wb)))
 inode_switch_wbs(inode, wbc->wb_id);
 }
+EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode);
 /**
 * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
@@ -701,9 +703,10 @@ void wbc_detach_inode(struct writeback_control *wbc)
 wb_put(wbc->wb);
 wbc->wb = NULL;
 }
+EXPORT_SYMBOL_GPL(wbc_detach_inode);
 /**
-* wbc_account_io - account IO issued during writeback
+* wbc_account_cgroup_owner - account writeback to update inode cgroup ownership
 * @wbc: writeback_control of the writeback in progress
 * @page: page being written out
 * @bytes: number of bytes being written out
@@ -712,8 +715,8 @@ void wbc_detach_inode(struct writeback_control *wbc)
 * controlled by @wbc. Keep the book for foreign inode detection. See
 * wbc_detach_inode().
 */
-void wbc_account_io(struct writeback_control *wbc, struct page *page,
+void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
 size_t bytes)
 {
 struct cgroup_subsys_state *css;
 int id;
@@ -724,7 +727,7 @@ void wbc_account_io(struct writeback_control *wbc, struct page *page,
 * behind a slow cgroup. Ultimately, we want pageout() to kick off
 * regular writeback instead of writing things out itself.
 */
-if (!wbc->wb)
+if (!wbc->wb || wbc->no_cgroup_owner)
 return;
 css = mem_cgroup_css_from_page(page);
@@ -750,7 +753,7 @@ void wbc_account_io(struct writeback_control *wbc, struct page *page,
 else
 wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
 }
-EXPORT_SYMBOL_GPL(wbc_account_io);
+EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);
 /**
 * inode_congested - test whether an inode is congested
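The filesystem hunks in this series all follow the same call pattern: wbc_init_bio() once per bio after the target device is known, then wbc_account_cgroup_owner() for every chunk of data added, so foreign-inode detection and cgroup ownership attribution keep working. A condensed, hypothetical write path showing that order (only the wbc_* helpers and core bio calls are real kernel APIs; the function itself and its error/sector handling are omitted or made up):

    #include <linux/bio.h>
    #include <linux/pagemap.h>
    #include <linux/writeback.h>

    static void demo_write_one_page(struct page *page,
                                    struct writeback_control *wbc)
    {
            struct bio *bio = bio_alloc(GFP_NOFS, 1);

            /* bi_sector setup and error handling omitted for brevity. */
            bio_set_dev(bio, page->mapping->host->i_sb->s_bdev);
            bio_add_page(bio, page, PAGE_SIZE, 0);

            /* Associate the bio with the wbc's cgroup before submission. */
            wbc_init_bio(wbc, bio);
            /* Account the data just added to the bio. */
            wbc_account_cgroup_owner(wbc, page, PAGE_SIZE);

            bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
            submit_bio(bio);
    }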

@@ -647,7 +647,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
 * the confused fail path above (OOM) will be very confused when
 * it finds all bh marked clean (i.e. it will not write anything)
 */
-wbc_account_io(wbc, page, PAGE_SIZE);
+wbc_account_cgroup_owner(wbc, page, PAGE_SIZE);
 length = first_unmapped << blkbits;
 if (bio_add_page(bio, page, length, 0) < length) {
 bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio);

@@ -796,7 +796,7 @@ xfs_add_to_ioend(
 }
 wpc->ioend->io_size += len;
-wbc_account_io(wbc, page, len);
+wbc_account_cgroup_owner(wbc, page, len);
 }
 STATIC void

@@ -48,6 +48,7 @@ extern spinlock_t bdi_lock;
 extern struct list_head bdi_list;
 extern struct workqueue_struct *bdi_wq;
+extern struct workqueue_struct *bdi_async_bio_wq;
 static inline bool wb_has_dirty_io(struct bdi_writeback *wb)
 {

@@ -132,13 +132,17 @@ struct blkcg_gq {
 struct blkg_policy_data *pd[BLKCG_MAX_POLS];
-struct rcu_head rcu_head;
+spinlock_t async_bio_lock;
+struct bio_list async_bios;
+struct work_struct async_bio_work;
 atomic_t use_delay;
 atomic64_t delay_nsec;
 atomic64_t delay_start;
 u64 last_delay;
 int last_use;
+struct rcu_head rcu_head;
 };
 typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
@@ -701,6 +705,15 @@ static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg
 struct bio *bio) { return false; }
 #endif
+bool __blkcg_punt_bio_submit(struct bio *bio);
+static inline bool blkcg_punt_bio_submit(struct bio *bio)
+{
+if (bio->bi_opf & REQ_CGROUP_PUNT)
+return __blkcg_punt_bio_submit(bio);
+else
+return false;
+}
 static inline void blkcg_bio_issue_init(struct bio *bio)
 {
@@ -848,6 +861,7 @@ static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
 static inline void blkg_get(struct blkcg_gq *blkg) { }
 static inline void blkg_put(struct blkcg_gq *blkg) { }
+static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; }
 static inline void blkcg_bio_issue_init(struct bio *bio) { }
 static inline bool blkcg_bio_issue_check(struct request_queue *q,
 struct bio *bio) { return true; }
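blkcg_punt_bio_submit() is meant to be checked early in the submission path: when the bio carries REQ_CGROUP_PUNT, __blkcg_punt_bio_submit() hands it off to the per-blkcg worker and the caller must not touch it again. A hedged sketch of such a call site (demo_submit_bio() is illustrative, not the actual block layer entry point):

    #include <linux/bio.h>
    #include <linux/blkdev.h>
    #include <linux/blk-cgroup.h>

    static blk_qc_t demo_submit_bio(struct bio *bio)
    {
            /*
             * If the punt flag is set, the bio is queued on the blkcg's
             * async_bios list and async_bio_work is kicked; stop here.
             */
            if (blkcg_punt_bio_submit(bio))
                    return BLK_QC_T_NONE;

            /* Normal synchronous submission continues otherwise. */
            return generic_make_request(bio);
    }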

@@ -311,6 +311,14 @@ enum req_flag_bits {
 __REQ_RAHEAD, /* read ahead, can fail anytime */
 __REQ_BACKGROUND, /* background IO */
 __REQ_NOWAIT, /* Don't wait if request will block */
+/*
+* When a shared kthread needs to issue a bio for a cgroup, doing
+* so synchronously can lead to priority inversions as the kthread
+* can be trapped waiting for that cgroup. CGROUP_PUNT flag makes
+* submit_bio() punt the actual issuing to a dedicated per-blkcg
+* work item to avoid such priority inversions.
+*/
+__REQ_CGROUP_PUNT,
 /* command specific flags for REQ_OP_WRITE_ZEROES: */
 __REQ_NOUNMAP, /* do not free blocks when zeroing */
@@ -337,6 +345,8 @@ enum req_flag_bits {
 #define REQ_RAHEAD (1ULL << __REQ_RAHEAD)
 #define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND)
 #define REQ_NOWAIT (1ULL << __REQ_NOWAIT)
+#define REQ_CGROUP_PUNT (1ULL << __REQ_CGROUP_PUNT)
 #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP)
 #define REQ_HIPRI (1ULL << __REQ_HIPRI)

@@ -344,10 +344,15 @@ struct queue_limits {
 #ifdef CONFIG_BLK_DEV_ZONED
+/*
+* Maximum number of zones to report with a single report zones command.
+*/
+#define BLK_ZONED_REPORT_MAX_ZONES 8192U
 extern unsigned int blkdev_nr_zones(struct block_device *bdev);
 extern int blkdev_report_zones(struct block_device *bdev,
 sector_t sector, struct blk_zone *zones,
-unsigned int *nr_zones, gfp_t gfp_mask);
+unsigned int *nr_zones);
 extern int blkdev_reset_zones(struct block_device *bdev, sector_t sectors,
 sector_t nr_sectors, gfp_t gfp_mask);
 extern int blk_revalidate_disk_zones(struct gendisk *disk);
@@ -681,7 +686,7 @@ static inline bool blk_queue_is_zoned(struct request_queue *q)
 }
 }
-static inline unsigned int blk_queue_zone_sectors(struct request_queue *q)
+static inline sector_t blk_queue_zone_sectors(struct request_queue *q)
 {
 return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0;
 }
@@ -1418,7 +1423,7 @@ static inline bool bdev_is_zoned(struct block_device *bdev)
 return false;
 }
-static inline unsigned int bdev_zone_sectors(struct block_device *bdev)
+static inline sector_t bdev_zone_sectors(struct block_device *bdev)
 {
 struct request_queue *q = bdev_get_queue(bdev);
@@ -1673,8 +1678,7 @@ struct block_device_operations {
 /* this callback is with swap_lock and sometimes page table lock held */
 void (*swap_slot_free_notify) (struct block_device *, unsigned long);
 int (*report_zones)(struct gendisk *, sector_t sector,
-struct blk_zone *zones, unsigned int *nr_zones,
-gfp_t gfp_mask);
+struct blk_zone *zones, unsigned int *nr_zones);
 struct module *owner;
 const struct pr_ops *pr_ops;
 };

@@ -699,6 +699,7 @@ void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
 struct cgroup_subsys_state;
 struct cgroup;
+static inline void css_get(struct cgroup_subsys_state *css) {}
 static inline void css_put(struct cgroup_subsys_state *css) {}
 static inline int cgroup_attach_task_all(struct task_struct *from,
 struct task_struct *t) { return 0; }

@@ -95,8 +95,7 @@ typedef int (*dm_prepare_ioctl_fn) (struct dm_target *ti, struct block_device **
 typedef int (*dm_report_zones_fn) (struct dm_target *ti, sector_t sector,
 struct blk_zone *zones,
-unsigned int *nr_zones,
-gfp_t gfp_mask);
+unsigned int *nr_zones);
 /*
 * These iteration functions are typically used to check (and combine)

@@ -75,7 +75,7 @@ struct elevator_type
 size_t icq_size; /* see iocontext.h */
 size_t icq_align; /* ditto */
 struct elv_fs_entry *elevator_attrs;
-char elevator_name[ELV_NAME_MAX];
+const char *elevator_name;
 const char *elevator_alias;
 struct module *elevator_owner;
 #ifdef CONFIG_BLK_DEBUG_FS
@@ -160,15 +160,6 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t);
 #define ELEVATOR_INSERT_FLUSH 5
 #define ELEVATOR_INSERT_SORT_MERGE 6
-/*
-* return values from elevator_may_queue_fn
-*/
-enum {
-ELV_MQUEUE_MAY,
-ELV_MQUEUE_NO,
-ELV_MQUEUE_MUST,
-};
 #define rq_end_sector(rq) (blk_rq_pos(rq) + blk_rq_sectors(rq))
 #define rb_entry_rq(node) rb_entry((node), struct request, rb_node)

@@ -315,7 +315,7 @@ struct nvme_id_ns {
 __u8 nmic;
 __u8 rescap;
 __u8 fpi;
-__u8 rsvd33;
+__u8 dlfeat;
 __le16 nawun;
 __le16 nawupf;
 __le16 nacwu;
@@ -324,11 +324,17 @@ struct nvme_id_ns {
 __le16 nabspf;
 __le16 noiob;
 __u8 nvmcap[16];
-__u8 rsvd64[28];
+__le16 npwg;
+__le16 npwa;
+__le16 npdg;
+__le16 npda;
+__le16 nows;
+__u8 rsvd74[18];
 __le32 anagrpid;
 __u8 rsvd96[3];
 __u8 nsattr;
-__u8 rsvd100[4];
+__le16 nvmsetid;
+__le16 endgid;
 __u8 nguid[16];
 __u8 eui64[8];
 struct nvme_lbaf lbaf[16];
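The new Identify Namespace fields (NPWG, NPWA, NPDG, NPDA, NOWS) are 0's based counts of logical blocks, so turning them into byte sizes requires the LBA size of the active format. A small illustrative conversion; the sample values are invented:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            /* Invented values, already converted from little endian. */
            uint16_t npwg = 7;       /* 0's based: 8 logical blocks */
            uint16_t nows = 15;      /* 0's based: 16 logical blocks */
            uint32_t lba_size = 512; /* bytes per logical block */

            printf("preferred write granularity: %u bytes\n",
                   (unsigned int)((npwg + 1) * lba_size));
            printf("optimal write size:          %u bytes\n",
                   (unsigned int)((nows + 1) * lba_size));
            return 0;
    }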

@@ -11,6 +11,7 @@
 #include <linux/flex_proportions.h>
 #include <linux/backing-dev-defs.h>
 #include <linux/blk_types.h>
+#include <linux/blk-cgroup.h>
 struct bio;
@@ -68,6 +69,17 @@ struct writeback_control {
 unsigned for_reclaim:1; /* Invoked from the page allocator */
 unsigned range_cyclic:1; /* range_start is cyclic */
 unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
+/*
+* When writeback IOs are bounced through async layers, only the
+* initial synchronous phase should be accounted towards inode
+* cgroup ownership arbitration to avoid confusion. Later stages
+* can set the following flag to disable the accounting.
+*/
+unsigned no_cgroup_owner:1;
+unsigned punt_to_cgroup:1; /* cgrp punting, see __REQ_CGROUP_PUNT */
 #ifdef CONFIG_CGROUP_WRITEBACK
 struct bdi_writeback *wb; /* wb this writeback is issued under */
 struct inode *inode; /* inode being written out */
@@ -84,12 +96,27 @@ struct writeback_control {
 static inline int wbc_to_write_flags(struct writeback_control *wbc)
 {
-if (wbc->sync_mode == WB_SYNC_ALL)
-return REQ_SYNC;
-else if (wbc->for_kupdate || wbc->for_background)
-return REQ_BACKGROUND;
-return 0;
+int flags = 0;
+if (wbc->punt_to_cgroup)
+flags = REQ_CGROUP_PUNT;
+if (wbc->sync_mode == WB_SYNC_ALL)
+flags |= REQ_SYNC;
+else if (wbc->for_kupdate || wbc->for_background)
+flags |= REQ_BACKGROUND;
+return flags;
 }
+static inline struct cgroup_subsys_state *
+wbc_blkcg_css(struct writeback_control *wbc)
+{
+#ifdef CONFIG_CGROUP_WRITEBACK
+if (wbc->wb)
+return wbc->wb->blkcg_css;
+#endif
+return blkcg_root_css;
+}
 /*
@@ -188,8 +215,8 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
 struct inode *inode)
 __releases(&inode->i_lock);
 void wbc_detach_inode(struct writeback_control *wbc);
-void wbc_account_io(struct writeback_control *wbc, struct page *page,
+void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
 size_t bytes);
 void cgroup_writeback_umount(void);
 /**
@@ -291,8 +318,8 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
 {
 }
-static inline void wbc_account_io(struct writeback_control *wbc,
+static inline void wbc_account_cgroup_owner(struct writeback_control *wbc,
 struct page *page, size_t bytes)
 {
 }
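The reworked wbc_to_write_flags() accumulates flags rather than returning early, which is what lets REQ_CGROUP_PUNT combine with REQ_SYNC or REQ_BACKGROUND. A standalone sketch of the same logic using simplified stand-ins for the kernel flag bits and writeback_control:

    #include <stdio.h>

    #define DEMO_REQ_SYNC        (1u << 0)
    #define DEMO_REQ_BACKGROUND  (1u << 1)
    #define DEMO_REQ_CGROUP_PUNT (1u << 2)

    struct demo_wbc {
            int sync_all;        /* stands in for sync_mode == WB_SYNC_ALL */
            int for_kupdate;
            int for_background;
            int punt_to_cgroup;
    };

    static unsigned int demo_write_flags(const struct demo_wbc *wbc)
    {
            unsigned int flags = 0;

            if (wbc->punt_to_cgroup)
                    flags = DEMO_REQ_CGROUP_PUNT;
            if (wbc->sync_all)
                    flags |= DEMO_REQ_SYNC;
            else if (wbc->for_kupdate || wbc->for_background)
                    flags |= DEMO_REQ_BACKGROUND;
            return flags;
    }

    int main(void)
    {
            struct demo_wbc punted_sync = { .sync_all = 1, .punt_to_cgroup = 1 };

            /* Prints 0x5: DEMO_REQ_CGROUP_PUNT | DEMO_REQ_SYNC. */
            printf("flags = 0x%x\n", demo_write_flags(&punted_sync));
            return 0;
    }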