10 лет назад · d82312c808
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -557,6 +557,18 @@ void blk_cleanup_queue(struct request_queue *q)
 
				 }
			
 
				 EXPORT_SYMBOL(blk_cleanup_queue);
			
 
				 
			
 
				+/* Allocate memory local to the request queue */
			
 
				+static void *alloc_request_struct(gfp_t gfp_mask, void *data)
			
 
				+{
			
 
				+	int nid = (int)(long)data;
			
 
				+	return kmem_cache_alloc_node(request_cachep, gfp_mask, nid);
			
 
				+}
			
 
				+
			
 
				+static void free_request_struct(void *element, void *unused)
			
 
				+{
			
 
				+	kmem_cache_free(request_cachep, element);
			
 
				+}
			
 
				+
			
 
				 int blk_init_rl(struct request_list *rl, struct request_queue *q,
			
 
				 		gfp_t gfp_mask)
			
 
				 {
			
@@ -569,9 +581,10 @@ int blk_init_rl(struct request_list *rl, struct request_queue *q,
 
				 	init_waitqueue_head(&rl->wait[BLK_RW_SYNC]);
			
 
				 	init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);
			
 
				 
			
 
				-	rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
			
 
				-					  mempool_free_slab, request_cachep,
			
 
				-					  gfp_mask, q->node);
			
 
				+	rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, alloc_request_struct,
			
 
				+					  free_request_struct,
			
 
				+					  (void *)(long)q->node, gfp_mask,
			
 
				+					  q->node);
			
 
				 	if (!rl->rq_pool)
			
 
				 		return -ENOMEM;
			
 
				 
			
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -436,6 +436,7 @@ int blk_mq_register_disk(struct gendisk *disk)
 
				 
			
 
				 	return 0;
			
 
				 }
			
 
				+EXPORT_SYMBOL_GPL(blk_mq_register_disk);
			
 
				 
			
 
				 void blk_mq_sysfs_unregister(struct request_queue *q)
			
 
				 {
			
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -33,7 +33,6 @@ static DEFINE_MUTEX(all_q_mutex);
 
				 static LIST_HEAD(all_q_list);
			
 
				 
			
 
				 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
			
 
				-static void blk_mq_run_queues(struct request_queue *q);
			
 
				 
			
 
				 /*
			
 
				  * Check if any of the ctx's have pending work in this hardware queue
			
@@ -78,7 +77,7 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
 
				 	clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
			
 
				 }
			
 
				 
			
 
				-static int blk_mq_queue_enter(struct request_queue *q)
			
 
				+static int blk_mq_queue_enter(struct request_queue *q, gfp_t gfp)
			
 
				 {
			
 
				 	while (true) {
			
 
				 		int ret;
			
@@ -86,6 +85,9 @@ static int blk_mq_queue_enter(struct request_queue *q)
 
				 		if (percpu_ref_tryget_live(&q->mq_usage_counter))
			
 
				 			return 0;
			
 
				 
			
 
				+		if (!(gfp & __GFP_WAIT))
			
 
				+			return -EBUSY;
			
 
				+
			
 
				 		ret = wait_event_interruptible(q->mq_freeze_wq,
			
 
				 				!q->mq_freeze_depth || blk_queue_dying(q));
			
 
				 		if (blk_queue_dying(q))
			
@@ -118,7 +120,7 @@ void blk_mq_freeze_queue_start(struct request_queue *q)
 
				 
			
 
				 	if (freeze) {
			
 
				 		percpu_ref_kill(&q->mq_usage_counter);
			
 
				-		blk_mq_run_queues(q);
			
 
				+		blk_mq_run_hw_queues(q, false);
			
 
				 	}
			
 
				 }
			
 
				 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
			
@@ -257,7 +259,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
 
				 	struct blk_mq_alloc_data alloc_data;
			
 
				 	int ret;
			
 
				 
			
 
				-	ret = blk_mq_queue_enter(q);
			
 
				+	ret = blk_mq_queue_enter(q, gfp);
			
 
				 	if (ret)
			
 
				 		return ERR_PTR(ret);
			
 
				 
			
@@ -904,7 +906,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 
				 			&hctx->run_work, 0);
			
 
				 }
			
 
				 
			
 
				-static void blk_mq_run_queues(struct request_queue *q)
			
 
				+void blk_mq_run_hw_queues(struct request_queue *q, bool async)
			
 
				 {
			
 
				 	struct blk_mq_hw_ctx *hctx;
			
 
				 	int i;
			
@@ -915,9 +917,10 @@ static void blk_mq_run_queues(struct request_queue *q)
 
				 		    test_bit(BLK_MQ_S_STOPPED, &hctx->state))
			
 
				 			continue;
			
 
				 
			
 
				-		blk_mq_run_hw_queue(hctx, false);
			
 
				+		blk_mq_run_hw_queue(hctx, async);
			
 
				 	}
			
 
				 }
			
 
				+EXPORT_SYMBOL(blk_mq_run_hw_queues);
			
 
				 
			
 
				 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
			
 
				 {
			
@@ -1186,7 +1189,7 @@ static struct request *blk_mq_map_request(struct request_queue *q,
 
				 	int rw = bio_data_dir(bio);
			
 
				 	struct blk_mq_alloc_data alloc_data;
			
 
				 
			
 
				-	if (unlikely(blk_mq_queue_enter(q))) {
			
 
				+	if (unlikely(blk_mq_queue_enter(q, GFP_KERNEL))) {
			
 
				 		bio_endio(bio, -EIO);
			
 
				 		return NULL;
			
 
				 	}
			
@@ -1517,8 +1520,6 @@ static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node)
 
				 	if (!bitmap->map)
			
 
				 		return -ENOMEM;
			
 
				 
			
 
				-	bitmap->map_size = num_maps;
			
 
				-
			
 
				 	total = nr_cpu_ids;
			
 
				 	for (i = 0; i < num_maps; i++) {
			
 
				 		bitmap->map[i].depth = min(total, bitmap->bits_per_word);
			
@@ -1759,8 +1760,6 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
 
				 			continue;
			
 
				 
			
 
				 		hctx = q->mq_ops->map_queue(q, i);
			
 
				-		cpumask_set_cpu(i, hctx->cpumask);
			
 
				-		hctx->nr_ctx++;
			
 
				 
			
 
				 		/*
			
 
				 		 * Set local node, IFF we have more than one hw queue. If
			
@@ -1797,6 +1796,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 
				 	}
			
 
				 
			
 
				 	queue_for_each_hw_ctx(q, hctx, i) {
			
 
				+		struct blk_mq_ctxmap *map = &hctx->ctx_map;
			
 
				+
			
 
				 		/*
			
 
				 		 * If no software queues are mapped to this hardware queue,
			
 
				 		 * disable it and free the request entries.
			
@@ -1812,6 +1813,13 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 
				 			continue;
			
 
				 		}
			
 
				 
			
 
				+		/*
			
 
				+		 * Set the map size to the number of bitmap words needed to
			
 
				+		 * cover the mapped software queues. Round up: a truncating
			
 
				+		 * division yields 0 whenever nr_ctx < bits_per_word and
			
 
				+		 * drops a partially used final word otherwise.
			
 
				+		 */
			
 
				+		map->map_size = (hctx->nr_ctx + map->bits_per_word - 1) /
			
 
				+				map->bits_per_word;
			
 
				+
			
 
				 		/*
			
 
				 		 * Initialize batch roundrobin counts
			
 
				 		 */
			
@@ -1888,10 +1896,26 @@ void blk_mq_release(struct request_queue *q)
 
				 }
			
 
				 
			
 
				 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
			
 
				+{
			
 
				+	struct request_queue *uninit_q, *q;
			
 
				+
			
 
				+	uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
			
 
				+	if (!uninit_q)
			
 
				+		return ERR_PTR(-ENOMEM);
			
 
				+
			
 
				+	q = blk_mq_init_allocated_queue(set, uninit_q);
			
 
				+	if (IS_ERR(q))
			
 
				+		blk_cleanup_queue(uninit_q);
			
 
				+
			
 
				+	return q;
			
 
				+}
			
 
				+EXPORT_SYMBOL(blk_mq_init_queue);
			
 
				+
			
 
				+struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
			
 
				+						  struct request_queue *q)
			
 
				 {
			
 
				 	struct blk_mq_hw_ctx **hctxs;
			
 
				 	struct blk_mq_ctx __percpu *ctx;
			
 
				-	struct request_queue *q;
			
 
				 	unsigned int *map;
			
 
				 	int i;
			
 
				 
			
@@ -1926,20 +1950,16 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 
				 		hctxs[i]->queue_num = i;
			
 
				 	}
			
 
				 
			
 
				-	q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
			
 
				-	if (!q)
			
 
				-		goto err_hctxs;
			
 
				-
			
 
				 	/*
			
 
				 	 * Init percpu_ref in atomic mode so that it's faster to shutdown.
			
 
				 	 * See blk_register_queue() for details.
			
 
				 	 */
			
 
				 	if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release,
			
 
				 			    PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
			
 
				-		goto err_mq_usage;
			
 
				+		goto err_hctxs;
			
 
				 
			
 
				 	setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
			
 
				-	blk_queue_rq_timeout(q, 30000);
			
 
				+	blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30000);
			
 
				 
			
 
				 	q->nr_queues = nr_cpu_ids;
			
 
				 	q->nr_hw_queues = set->nr_hw_queues;
			
@@ -1965,9 +1985,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 
				 	else
			
 
				 		blk_queue_make_request(q, blk_sq_make_request);
			
 
				 
			
 
				-	if (set->timeout)
			
 
				-		blk_queue_rq_timeout(q, set->timeout);
			
 
				-
			
 
				 	/*
			
 
				 	 * Do this after blk_queue_make_request() overrides it...
			
 
				 	 */
			
@@ -1979,7 +1996,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 
				 	blk_mq_init_cpu_queues(q, set->nr_hw_queues);
			
 
				 
			
 
				 	if (blk_mq_init_hw_queues(q, set))
			
 
				-		goto err_mq_usage;
			
 
				+		goto err_hctxs;
			
 
				 
			
 
				 	mutex_lock(&all_q_mutex);
			
 
				 	list_add_tail(&q->all_q_node, &all_q_list);
			
@@ -1991,8 +2008,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 
				 
			
 
				 	return q;
			
 
				 
			
 
				-err_mq_usage:
			
 
				-	blk_cleanup_queue(q);
			
 
				 err_hctxs:
			
 
				 	kfree(map);
			
 
				 	for (i = 0; i < set->nr_hw_queues; i++) {
			
@@ -2007,7 +2022,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 
				 	free_percpu(ctx);
			
 
				 	return ERR_PTR(-ENOMEM);
			
 
				 }
			
 
				-EXPORT_SYMBOL(blk_mq_init_queue);
			
 
				+EXPORT_SYMBOL(blk_mq_init_allocated_queue);
			
 
				 
			
 
				 void blk_mq_free_queue(struct request_queue *q)
			
 
				 {
			
@@ -2159,7 +2174,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 
				 	if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
			
 
				 		return -EINVAL;
			
 
				 
			
 
				-	if (!set->nr_hw_queues || !set->ops->queue_rq || !set->ops->map_queue)
			
 
				+	if (!set->ops->queue_rq || !set->ops->map_queue)
			
 
				 		return -EINVAL;
			
 
				 
			
 
				 	if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
			
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -77,6 +77,11 @@ struct kioctx_cpu {
 
				 	unsigned		reqs_available;
			
 
				 };
			
 
				 
			
 
				+struct ctx_rq_wait {
			
 
				+	struct completion comp;
			
 
				+	atomic_t count;
			
 
				+};
			
 
				+
			
 
				 struct kioctx {
			
 
				 	struct percpu_ref	users;
			
 
				 	atomic_t		dead;
			
@@ -115,7 +120,7 @@ struct kioctx {
 
				 	/*
			
 
				 	 * signals when all in-flight requests are done
			
 
				 	 */
			
 
				-	struct completion *requests_done;
			
 
				+	struct ctx_rq_wait	*rq_wait;
			
 
				 
			
 
				 	struct {
			
 
				 		/*
			
@@ -572,8 +577,8 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
 
				 	struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
			
 
				 
			
 
				 	/* At this point we know that there are no in-flight requests */
			
 
				-	if (ctx->requests_done)
			
 
				-		complete(ctx->requests_done);
			
 
				+	if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
			
 
				+		complete(&ctx->rq_wait->comp);
			
 
				 
			
 
				 	INIT_WORK(&ctx->free_work, free_ioctx);
			
 
				 	schedule_work(&ctx->free_work);
			
@@ -783,7 +788,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
				  *	the rapid destruction of the kioctx.
			
 
				  */
			
 
				 static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
			
 
				-		struct completion *requests_done)
			
 
				+		      struct ctx_rq_wait *wait)
			
 
				 {
			
 
				 	struct kioctx_table *table;
			
 
				 
			
@@ -813,7 +818,7 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
 
				 	if (ctx->mmap_size)
			
 
				 		vm_munmap(ctx->mmap_base, ctx->mmap_size);
			
 
				 
			
 
				-	ctx->requests_done = requests_done;
			
 
				+	ctx->rq_wait = wait;
			
 
				 	percpu_ref_kill(&ctx->users);
			
 
				 	return 0;
			
 
				 }
			
@@ -829,18 +834,24 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
 
				 void exit_aio(struct mm_struct *mm)
			
 
				 {
			
 
				 	struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table);
			
 
				-	int i;
			
 
				+	struct ctx_rq_wait wait;
			
 
				+	int i, skipped;
			
 
				 
			
 
				 	if (!table)
			
 
				 		return;
			
 
				 
			
 
				+	atomic_set(&wait.count, table->nr);
			
 
				+	init_completion(&wait.comp);
			
 
				+
			
 
				+	skipped = 0;
			
 
				 	for (i = 0; i < table->nr; ++i) {
			
 
				 		struct kioctx *ctx = table->table[i];
			
 
				-		struct completion requests_done =
			
 
				-			COMPLETION_INITIALIZER_ONSTACK(requests_done);
			
 
				 
			
 
				-		if (!ctx)
			
 
				+		if (!ctx) {
			
 
				+			skipped++;
			
 
				 			continue;
			
 
				+		}
			
 
				+
			
 
				 		/*
			
 
				 		 * We don't need to bother with munmap() here - exit_mmap(mm)
			
 
				 		 * is coming and it'll unmap everything. And we simply can't,
			
@@ -849,10 +860,12 @@ void exit_aio(struct mm_struct *mm)
 
				 		 * that it needs to unmap the area, just set it to 0.
			
 
				 		 */
			
 
				 		ctx->mmap_size = 0;
			
 
				-		kill_ioctx(mm, ctx, &requests_done);
			
 
				+		kill_ioctx(mm, ctx, &wait);
			
 
				+	}
			
 
				 
			
 
				+	if (!atomic_sub_and_test(skipped, &wait.count)) {
			
 
				 		/* Wait until all IO for the context are done. */
			
 
				-		wait_for_completion(&requests_done);
			
 
				+		wait_for_completion(&wait.comp);
			
 
				 	}
			
 
				 
			
 
				 	RCU_INIT_POINTER(mm->ioctx_table, NULL);
			
@@ -1331,15 +1344,17 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
 
				 {
			
 
				 	struct kioctx *ioctx = lookup_ioctx(ctx);
			
 
				 	if (likely(NULL != ioctx)) {
			
 
				-		struct completion requests_done =
			
 
				-			COMPLETION_INITIALIZER_ONSTACK(requests_done);
			
 
				+		struct ctx_rq_wait wait;
			
 
				 		int ret;
			
 
				 
			
 
				+		init_completion(&wait.comp);
			
 
				+		atomic_set(&wait.count, 1);
			
 
				+
			
 
				 		/* Pass the wait structure to kill_ioctx() where it can be set
			
 
				 		 * in a thread-safe way. If we try to set it here then we have
			
 
				 		 * a race condition if two io_destroy() called simultaneously.
			
 
				 		 */
			
 
				-		ret = kill_ioctx(current->mm, ioctx, &requests_done);
			
 
				+		ret = kill_ioctx(current->mm, ioctx, &wait);
			
 
				 		percpu_ref_put(&ioctx->users);
			
 
				 
			
 
				 		/* Wait until all IO for the context are done. Otherwise kernel
			
@@ -1347,7 +1362,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
 
				 		 * is destroyed.
			
 
				 		 */
			
 
				 		if (!ret)
			
 
				-			wait_for_completion(&requests_done);
			
 
				+			wait_for_completion(&wait.comp);
			
 
				 
			
 
				 		return ret;
			
 
				 	}
			
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -164,6 +164,8 @@ enum {
 
				 		<< BLK_MQ_F_ALLOC_POLICY_START_BIT)
			
 
				 
			
 
				 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
			
 
				+struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
			
 
				+						  struct request_queue *q);
			
 
				 void blk_mq_finish_init(struct request_queue *q);
			
 
				 int blk_mq_register_disk(struct gendisk *);
			
 
				 void blk_mq_unregister_disk(struct gendisk *);
			
@@ -218,6 +220,7 @@ void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
 
				 void blk_mq_stop_hw_queues(struct request_queue *q);
			
 
				 void blk_mq_start_hw_queues(struct request_queue *q);
			
 
				 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
			
 
				+void blk_mq_run_hw_queues(struct request_queue *q, bool async);
			
 
				 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
			
 
				 void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
			
 
				 		void *priv);
			
@@ -227,7 +230,7 @@ void blk_mq_freeze_queue_start(struct request_queue *q);
 
				 
			
 
				 /*
			
 
				  * Driver command data is immediately after the request. So subtract request
			
 
				- * size to get back to the original request.
			
 
				+ * size to get back to the original request, add request size to get the PDU.
			
 
				  */
			
 
				 static inline struct request *blk_mq_rq_from_pdu(void *pdu)
			
 
				 {
			
@@ -235,7 +238,7 @@ static inline struct request *blk_mq_rq_from_pdu(void *pdu)
 
				 }
			
 
				 static inline void *blk_mq_rq_to_pdu(struct request *rq)
			
 
				 {
			
 
				-	return (void *) rq + sizeof(*rq);
			
 
				+	return rq + 1;
			
 
				 }
			
 
				 
			
 
				 #define queue_for_each_hw_ctx(q, hctx, i)				\