From bf7ec93c644cb0064ba7d2fc40d4841c5ba382ab Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 4 Oct 2019 17:01:08 +0300 Subject: [PATCH 1/8] io_uring: fix reversed nonblock flag for link submission io_queue_link_head() accepts @force_nonblock flag, but io_ring_submit() passes something opposite. Fixes: c576666863b78 ("io_uring: optimize submit_and_wait API") Reported-by: kbuild test robot Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 0bc167aca46d..ab8c4e5e442c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2761,7 +2761,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit, if (link) io_queue_link_head(ctx, link, &link->submit, shadow_req, - block_for_last); + !block_for_last); if (statep) io_submit_state_end(statep); From a2b90f11217790ec0964ba9c93a4abb369758c26 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Fri, 4 Oct 2019 13:00:24 +0300 Subject: [PATCH 2/8] bdi: Do not use freezable workqueue A removable block device, such as NVMe or SSD connected over Thunderbolt can be hot-removed any time including when the system is suspended. When device is hot-removed during suspend and the system gets resumed, kernel first resumes devices and then thaws the userspace including freezable workqueues. What happens in that case is that the NVMe driver notices that the device is unplugged and removes it from the system. This ends up calling bdi_unregister() for the gendisk which then schedules wb_workfn() to be run one more time. However, since the bdi_wq is still frozen flush_delayed_work() call in wb_shutdown() blocks forever halting system resume process. User sees this as hang as nothing is happening anymore. Triggering sysrq-w reveals this: Workqueue: nvme-wq nvme_remove_dead_ctrl_work [nvme] Call Trace: ? __schedule+0x2c5/0x630 ? wait_for_completion+0xa4/0x120 schedule+0x3e/0xc0 schedule_timeout+0x1c9/0x320 ? resched_curr+0x1f/0xd0 ? wait_for_completion+0xa4/0x120 wait_for_completion+0xc3/0x120 ? wake_up_q+0x60/0x60 __flush_work+0x131/0x1e0 ? flush_workqueue_prep_pwqs+0x130/0x130 bdi_unregister+0xb9/0x130 del_gendisk+0x2d2/0x2e0 nvme_ns_remove+0xed/0x110 [nvme_core] nvme_remove_namespaces+0x96/0xd0 [nvme_core] nvme_remove+0x5b/0x160 [nvme] pci_device_remove+0x36/0x90 device_release_driver_internal+0xdf/0x1c0 nvme_remove_dead_ctrl_work+0x14/0x30 [nvme] process_one_work+0x1c2/0x3f0 worker_thread+0x48/0x3e0 kthread+0x100/0x140 ? current_work+0x30/0x30 ? kthread_park+0x80/0x80 ret_from_fork+0x35/0x40 This is not limited to NVMes so exactly same issue can be reproduced by hot-removing SSD (over Thunderbolt) while the system is suspended. Prevent this from happening by removing WQ_FREEZABLE from bdi_wq. Reported-by: AceLan Kao Link: https://marc.info/?l=linux-kernel&m=138695698516487 Link: https://bugzilla.kernel.org/show_bug.cgi?id=204385 Link: https://lore.kernel.org/lkml/20191002122136.GD2819@lahna.fi.intel.com/#t Acked-by: Rafael J. Wysocki Signed-off-by: Mika Westerberg Signed-off-by: Jens Axboe --- mm/backing-dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index d9daa3e422d0..c360f6a6c844 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -239,8 +239,8 @@ static int __init default_bdi_init(void) { int err; - bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE | - WQ_UNBOUND | WQ_SYSFS, 0); + bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND | + WQ_SYSFS, 0); if (!bdi_wq) return -ENOMEM; From 0e48f51cbbfbdb79149806de14dcb8bf0f01ca94 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Fri, 4 Oct 2019 13:00:25 +0300 Subject: [PATCH 3/8] Revert "libata, freezer: avoid block device removal while system is frozen" This reverts commit 85fbd722ad0f5d64d1ad15888cd1eb2188bfb557. The commit was added as a quick band-aid for a hang that happened when a block device was removed during system suspend. Now that bdi_wq is not freezable anymore the hang should not be possible and we can get rid of this hack by reverting it. Acked-by: Rafael J. Wysocki Signed-off-by: Mika Westerberg Signed-off-by: Jens Axboe --- drivers/ata/libata-scsi.c | 21 --------------------- kernel/freezer.c | 6 ------ 2 files changed, 27 deletions(-) diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 76d0f9de767b..58e09ffe8b9c 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -4791,27 +4791,6 @@ void ata_scsi_hotplug(struct work_struct *work) return; } - /* - * XXX - UGLY HACK - * - * The block layer suspend/resume path is fundamentally broken due - * to freezable kthreads and workqueue and may deadlock if a block - * device gets removed while resume is in progress. I don't know - * what the solution is short of removing freezable kthreads and - * workqueues altogether. - * - * The following is an ugly hack to avoid kicking off device - * removal while freezer is active. This is a joke but does avoid - * this particular deadlock scenario. - * - * https://bugzilla.kernel.org/show_bug.cgi?id=62801 - * http://marc.info/?l=linux-kernel&m=138695698516487 - */ -#ifdef CONFIG_FREEZER - while (pm_freezing) - msleep(10); -#endif - DPRINTK("ENTER\n"); mutex_lock(&ap->scsi_scan_mutex); diff --git a/kernel/freezer.c b/kernel/freezer.c index c0738424bb43..dc520f01f99d 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -22,12 +22,6 @@ EXPORT_SYMBOL(system_freezing_cnt); bool pm_freezing; bool pm_nosig_freezing; -/* - * Temporary export for the deadlock workaround in ata_scsi_hotplug(). - * Remove once the hack becomes unnecessary. - */ -EXPORT_SYMBOL_GPL(pm_freezing); - /* protects freezing and frozen transitions */ static DEFINE_SPINLOCK(freezer_lock); From b84477d3ebb96294f87dc3161e53fa8fe22d9bfd Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Sat, 5 Oct 2019 11:59:27 -0700 Subject: [PATCH 4/8] blk-wbt: fix performance regression in wbt scale_up/scale_down scale_up wakes up waiters after scaling up. But after scaling max, it should not wake up more waiters as waiters will not have anything to do. This patch fixes this by making scale_up (and also scale_down) return when threshold is reached. This bug causes increased fdatasync latency when fdatasync and dd conv=sync are performed in parallel on 4.19 compared to 4.14. This bug was introduced during refactoring of blk-wbt code. Fixes: a79050434b45 ("blk-rq-qos: refactor out common elements of blk-wbt") Cc: stable@vger.kernel.org Cc: Josef Bacik Signed-off-by: Harshad Shirwadkar Signed-off-by: Jens Axboe --- block/blk-rq-qos.c | 14 +++++++++----- block/blk-rq-qos.h | 4 ++-- block/blk-wbt.c | 6 ++++-- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index 61b635bc2a31..656460636ad3 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -160,24 +160,27 @@ bool rq_depth_calc_max_depth(struct rq_depth *rqd) return ret; } -void rq_depth_scale_up(struct rq_depth *rqd) +/* Returns true on success and false if scaling up wasn't possible */ +bool rq_depth_scale_up(struct rq_depth *rqd) { /* * Hit max in previous round, stop here */ if (rqd->scaled_max) - return; + return false; rqd->scale_step--; rqd->scaled_max = rq_depth_calc_max_depth(rqd); + return true; } /* * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we - * had a latency violation. + * had a latency violation. Returns true on success and returns false if + * scaling down wasn't possible. */ -void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle) +bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle) { /* * Stop scaling down when we've hit the limit. This also prevents @@ -185,7 +188,7 @@ void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle) * keep up. */ if (rqd->max_depth == 1) - return; + return false; if (rqd->scale_step < 0 && hard_throttle) rqd->scale_step = 0; @@ -194,6 +197,7 @@ void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle) rqd->scaled_max = false; rq_depth_calc_max_depth(rqd); + return true; } struct rq_qos_wait_data { diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 08a09dbe0f4b..e8cb68f6958a 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -130,8 +130,8 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, acquire_inflight_cb_t *acquire_inflight_cb, cleanup_cb_t *cleanup_cb); bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit); -void rq_depth_scale_up(struct rq_depth *rqd); -void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle); +bool rq_depth_scale_up(struct rq_depth *rqd); +bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle); bool rq_depth_calc_max_depth(struct rq_depth *rqd); void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio); diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 8af553a0ba00..8641ba9793c5 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -308,7 +308,8 @@ static void calc_wb_limits(struct rq_wb *rwb) static void scale_up(struct rq_wb *rwb) { - rq_depth_scale_up(&rwb->rq_depth); + if (!rq_depth_scale_up(&rwb->rq_depth)) + return; calc_wb_limits(rwb); rwb->unknown_cnt = 0; rwb_wake_all(rwb); @@ -317,7 +318,8 @@ static void scale_up(struct rq_wb *rwb) static void scale_down(struct rq_wb *rwb, bool hard_throttle) { - rq_depth_scale_down(&rwb->rq_depth, hard_throttle); + if (!rq_depth_scale_down(&rwb->rq_depth, hard_throttle)) + return; calc_wb_limits(rwb); rwb->unknown_cnt = 0; rwb_trace_step(rwb, "scale down"); From 6805b32ec2b0897eb180295385efe306e5ac3b3d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 8 Oct 2019 02:18:42 +0300 Subject: [PATCH 5/8] io_uring: remove wait loop spurious wakeups Any changes interesting to tasks waiting in io_cqring_wait() are commited with io_cqring_ev_posted(). However, io_ring_drop_ctx_refs() also tries to do that but with no reason, that means spurious wakeups every io_free_req() and io_uring_enter(). Just use percpu_ref_put() instead. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ab8c4e5e442c..ceb3497bdd2a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -591,14 +591,6 @@ static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data, io_cqring_ev_posted(ctx); } -static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs) -{ - percpu_ref_put_many(&ctx->refs, refs); - - if (waitqueue_active(&ctx->wait)) - wake_up(&ctx->wait); -} - static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, struct io_submit_state *state) { @@ -646,7 +638,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, req->result = 0; return req; out: - io_ring_drop_ctx_refs(ctx, 1); + percpu_ref_put(&ctx->refs); return NULL; } @@ -654,7 +646,7 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr) { if (*nr) { kmem_cache_free_bulk(req_cachep, *nr, reqs); - io_ring_drop_ctx_refs(ctx, *nr); + percpu_ref_put_many(&ctx->refs, *nr); *nr = 0; } } @@ -663,7 +655,7 @@ static void __io_free_req(struct io_kiocb *req) { if (req->file && !(req->flags & REQ_F_FIXED_FILE)) fput(req->file); - io_ring_drop_ctx_refs(req->ctx, 1); + percpu_ref_put(&req->ctx->refs); kmem_cache_free(req_cachep, req); } @@ -3584,7 +3576,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, } } - io_ring_drop_ctx_refs(ctx, 1); + percpu_ref_put(&ctx->refs); out_fput: fdput(f); return submitted ? submitted : ret; From 8a99734081775c012a4a6c442fdef0379fe52bdf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 9 Oct 2019 14:40:13 -0600 Subject: [PATCH 6/8] io_uring: only flush workqueues on fileset removal We should not remove the workqueue, we just need to ensure that the workqueues are synced. The workqueues are torn down on ctx removal. Cc: stable@vger.kernel.org Fixes: 6b06314c47e1 ("io_uring: add file set registration") Reported-by: Stefan Hajnoczi Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ceb3497bdd2a..2c44648217bd 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2866,8 +2866,12 @@ static void io_finish_async(struct io_ring_ctx *ctx) static void io_destruct_skb(struct sk_buff *skb) { struct io_ring_ctx *ctx = skb->sk->sk_user_data; + int i; + + for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++) + if (ctx->sqo_wq[i]) + flush_workqueue(ctx->sqo_wq[i]); - io_finish_async(ctx); unix_destruct_scm(skb); } From 79a85e214d62da9a750cc63ef49483e62abbda81 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 10 Oct 2019 00:38:13 +0900 Subject: [PATCH 7/8] null_blk: Fix zoned command return code The return code from null_handle_zoned() sets the cmd->error value. Returning OK status when an error occured overwrites the intended cmd->error. Return the appropriate error code instead of setting the error in the cmd. Fixes: fceb5d1b19cbe626 ("null_blk: create a helper for zoned devices") Cc: Chaitanya Kulkarni Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/block/null_blk_zoned.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index eabc116832a7..3d7fdea872f8 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -142,8 +142,7 @@ static blk_status_t null_zone_reset(struct nullb_cmd *cmd, sector_t sector) zone->wp = zone->start; break; default: - cmd->error = BLK_STS_NOTSUPP; - break; + return BLK_STS_NOTSUPP; } return BLK_STS_OK; } From 862488105b84ca744b3d8ff131e0fcfe10644be1 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 19 Sep 2019 11:44:27 +0530 Subject: [PATCH 8/8] nbd: fix possible sysfs duplicate warning 1. nbd_put takes the mutex and drops nbd->ref to 0. It then does idr_remove and drops the mutex. 2. nbd_genl_connect takes the mutex. idr_find/idr_for_each fails to find an existing device, so it does nbd_dev_add. 3. just before the nbd_put could call nbd_dev_remove or not finished totally, but if nbd_dev_add try to add_disk, we can hit: debugfs: Directory 'nbd1' with parent 'block' already present! This patch will make sure all the disk add/remove stuff are done by holding the nbd_index_mutex lock. Reported-by: Mike Christie Reviewed-by: Josef Bacik Signed-off-by: Xiubo Li Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index ac07e8c94c79..478aa86fc1f2 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -248,8 +248,8 @@ static void nbd_put(struct nbd_device *nbd) if (refcount_dec_and_mutex_lock(&nbd->refs, &nbd_index_mutex)) { idr_remove(&nbd_index_idr, nbd->index); - mutex_unlock(&nbd_index_mutex); nbd_dev_remove(nbd); + mutex_unlock(&nbd_index_mutex); } }