@@ -279,17 +279,25 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
         blk_mq_queue_exit(q);
 }
 
-void blk_mq_free_request(struct request *rq)
+void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
 {
         struct blk_mq_ctx *ctx = rq->mq_ctx;
-        struct blk_mq_hw_ctx *hctx;
-        struct request_queue *q = rq->q;
 
         ctx->rq_completed[rq_is_sync(rq)]++;
-
-        hctx = q->mq_ops->map_queue(q, ctx->cpu);
         __blk_mq_free_request(hctx, ctx, rq);
+
+}
+EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request);
+
+void blk_mq_free_request(struct request *rq)
+{
+        struct blk_mq_hw_ctx *hctx;
+        struct request_queue *q = rq->q;
+
+        hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu);
+        blk_mq_free_hctx_request(hctx, rq);
 }
+EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
 inline void __blk_mq_end_request(struct request *rq, int error)
 {
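
blk_mq_free_hctx_request() is exported so that a caller which already knows the hardware context can free a request without repeating the q->mq_ops->map_queue() lookup that blk_mq_free_request() still performs. A minimal sketch of such a caller, assuming a hypothetical driver (the mydrv_* name is invented for illustration and is not part of this patch):

    /* Hypothetical driver helper; it was handed the hctx already. */
    static void mydrv_free_internal_rq(struct blk_mq_hw_ctx *hctx,
                                       struct request *rq)
    {
            /* No map_queue() lookup needed, unlike blk_mq_free_request(). */
            blk_mq_free_hctx_request(hctx, rq);
    }
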
@@ -591,7 +599,7 @@ static void blk_mq_rq_timer(unsigned long priv)
                 * If no software queues are currently mapped to this
                 * hardware queue, there's nothing to check
                 */
-               if (!hctx->nr_ctx || !hctx->tags)
+               if (!blk_mq_hw_queue_mapped(hctx))
                        continue;
 
                blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data);
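
The timeout scan now asks a named helper whether any software queues are mapped instead of open-coding the nr_ctx/tags test. The helper comes from the companion block/blk-mq.h change and is not shown in this diff; it presumably reduces to the same test, along these lines:

    /* Presumed shape of the helper (block/blk-mq.h); not part of this hunk. */
    static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
    {
            return hctx->nr_ctx && hctx->tags;
    }
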
@@ -690,6 +698,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
        struct request_queue *q = hctx->queue;
        struct request *rq;
        LIST_HEAD(rq_list);
+       LIST_HEAD(driver_list);
+       struct list_head *dptr;
        int queued;
 
        WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
@@ -715,17 +725,28 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
                spin_unlock(&hctx->lock);
        }
 
+       /*
+        * Start off with dptr being NULL, so we start the first request
+        * immediately, even if we have more pending.
+        */
+       dptr = NULL;
+
        /*
         * Now process all the entries, sending them to the driver.
         */
        queued = 0;
        while (!list_empty(&rq_list)) {
+               struct blk_mq_queue_data bd;
                int ret;
 
                rq = list_first_entry(&rq_list, struct request, queuelist);
                list_del_init(&rq->queuelist);
 
-               ret = q->mq_ops->queue_rq(hctx, rq, list_empty(&rq_list));
+               bd.rq = rq;
+               bd.list = dptr;
+               bd.last = list_empty(&rq_list);
+
+               ret = q->mq_ops->queue_rq(hctx, &bd);
                switch (ret) {
                case BLK_MQ_RQ_QUEUE_OK:
                        queued++;
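
The dispatch loop no longer passes the request and the 'last' flag as separate arguments to ->queue_rq(); it bundles them, together with a list the driver may use to park requests it chooses to defer, into a blk_mq_queue_data descriptor. The descriptor and the new ->queue_rq() prototype belong to the companion include/linux/blk-mq.h change, which is not part of this diff but presumably looks like:

    struct blk_mq_queue_data {
            struct request *rq;
            struct list_head *list;
            bool last;
    };

    typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *,
                              const struct blk_mq_queue_data *);
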
@@ -744,6 +765,13 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 
                if (ret == BLK_MQ_RQ_QUEUE_BUSY)
                        break;
+
+               /*
+                * We've done the first request. If we have more than 1
+                * left in the list, set dptr to defer issue.
+                */
+               if (!dptr && rq_list.next != rq_list.prev)
+                       dptr = &driver_list;
        }
 
        if (!queued)
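
With dptr pointing at driver_list after the first dispatch, a driver that understands the hint can post several commands and touch its submission doorbell only once per batch. A hedged sketch of such a driver-side ->queue_rq(), where the mydrv_* names and helpers are invented for illustration and only the blk-mq calls are real:

    static int mydrv_queue_rq(struct blk_mq_hw_ctx *hctx,
                              const struct blk_mq_queue_data *bd)
    {
            struct mydrv_queue *mq = hctx->driver_data;

            blk_mq_start_request(bd->rq);
            mydrv_post_command(mq, bd->rq);         /* write the SQ entry */

            /* Only kick the hardware when the batch is complete. */
            if (bd->last)
                    mydrv_ring_doorbell(mq);

            return BLK_MQ_RQ_QUEUE_OK;
    }
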
@@ -770,10 +798,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
  */
 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 {
-       int cpu = hctx->next_cpu;
+       if (hctx->queue->nr_hw_queues == 1)
+               return WORK_CPU_UNBOUND;
 
        if (--hctx->next_cpu_batch <= 0) {
-               int next_cpu;
+               int cpu = hctx->next_cpu, next_cpu;
 
                next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
                if (next_cpu >= nr_cpu_ids)
@@ -781,26 +810,32 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 
                hctx->next_cpu = next_cpu;
                hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
+
+               return cpu;
        }
 
-       return cpu;
+       return hctx->next_cpu;
 }
 
 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 {
-       if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
+       if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state) ||
+           !blk_mq_hw_queue_mapped(hctx)))
                return;
 
-       if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask))
-               __blk_mq_run_hw_queue(hctx);
-       else if (hctx->queue->nr_hw_queues == 1)
-               kblockd_schedule_delayed_work(&hctx->run_work, 0);
-       else {
-               unsigned int cpu;
+       if (!async) {
+               int cpu = get_cpu();
+               if (cpumask_test_cpu(cpu, hctx->cpumask)) {
+                       __blk_mq_run_hw_queue(hctx);
+                       put_cpu();
+                       return;
+               }
 
-               cpu = blk_mq_hctx_next_cpu(hctx);
-               kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0);
+               put_cpu();
        }
+
+       kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+                       &hctx->run_work, 0);
 }
 
 void blk_mq_run_queues(struct request_queue *q, bool async)
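
blk_mq_hctx_next_cpu() now leaves the work unbound when there is a single hardware queue, and otherwise round-robins across the CPUs mapped to the hctx, moving on only every BLK_MQ_CPU_WORK_BATCH calls so the run work is not bounced to a new CPU on every invocation. A small user-space model of that batching behaviour, for illustration only (the constants and CPU list are made up):

    #include <stdio.h>

    #define WORK_BATCH 8                    /* stands in for BLK_MQ_CPU_WORK_BATCH */

    static int cpus[] = { 0, 2, 4, 6 };     /* pretend CPUs mapped to the hctx */
    static int nr_cpus = 4, next_idx, batch = WORK_BATCH;

    static int next_cpu(void)
    {
            if (--batch <= 0) {
                    int cpu = cpus[next_idx];       /* current CPU gets the last batch entry */

                    next_idx = (next_idx + 1) % nr_cpus;
                    batch = WORK_BATCH;
                    return cpu;
            }
            return cpus[next_idx];
    }

    int main(void)
    {
            for (int i = 0; i < 24; i++)
                    printf("run %2d -> cpu %d\n", i, next_cpu());
            return 0;
    }
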
@@ -814,9 +849,7 @@ void blk_mq_run_queues(struct request_queue *q, bool async)
                    test_bit(BLK_MQ_S_STOPPED, &hctx->state))
                        continue;
 
-               preempt_disable();
                blk_mq_run_hw_queue(hctx, async);
-               preempt_enable();
        }
 }
 EXPORT_SYMBOL(blk_mq_run_queues);
@@ -843,9 +876,7 @@ void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
        clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
 
-       preempt_disable();
        blk_mq_run_hw_queue(hctx, false);
-       preempt_enable();
 }
 EXPORT_SYMBOL(blk_mq_start_hw_queue);
 
@@ -870,9 +901,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
                        continue;
 
                clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
-               preempt_disable();
                blk_mq_run_hw_queue(hctx, async);
-               preempt_enable();
        }
 }
 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
@@ -898,16 +927,11 @@ static void blk_mq_delay_work_fn(struct work_struct *work)
 
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
 {
-       unsigned long tmo = msecs_to_jiffies(msecs);
-
-       if (hctx->queue->nr_hw_queues == 1)
-               kblockd_schedule_delayed_work(&hctx->delay_work, tmo);
-       else {
-               unsigned int cpu;
+       if (unlikely(!blk_mq_hw_queue_mapped(hctx)))
+               return;
 
-               cpu = blk_mq_hctx_next_cpu(hctx);
-               kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo);
-       }
+       kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+                       &hctx->delay_work, msecs_to_jiffies(msecs));
 }
 EXPORT_SYMBOL(blk_mq_delay_queue);
 
@@ -1162,7 +1186,17 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
                goto run_queue;
        }
 
-       if (is_sync) {
+       /*
+        * If the driver supports deferred issue based on 'last', then
+        * queue it up like normal since we can potentially save some
+        * CPU this way.
+        */
+       if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
+               struct blk_mq_queue_data bd = {
+                       .rq = rq,
+                       .list = NULL,
+                       .last = 1
+               };
                int ret;
 
                blk_mq_bio_to_request(rq, bio);
@@ -1172,7 +1206,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
                 * error (busy), just add it to our list as we previously
                 * would have done
                 */
-               ret = q->mq_ops->queue_rq(data.hctx, rq, true);
+               ret = q->mq_ops->queue_rq(data.hctx, &bd);
                if (ret == BLK_MQ_RQ_QUEUE_OK)
                        goto done;
                else {
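
A driver opts into this deferred-issue path by setting BLK_MQ_F_DEFER_ISSUE in its tag set flags (the flag itself comes from the companion header change); otherwise sync requests keep taking the direct-issue branch shown above. A hypothetical opt-in, for illustration only (the mydrv_* names are invented):

    static int mydrv_init_tag_set(struct blk_mq_tag_set *set)
    {
            memset(set, 0, sizeof(*set));
            set->ops          = &mydrv_mq_ops;      /* ->queue_rq() honours bd->list/bd->last */
            set->nr_hw_queues = 1;
            set->queue_depth  = 64;
            set->numa_node    = NUMA_NO_NODE;
            set->flags        = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_DEFER_ISSUE;

            return blk_mq_alloc_tag_set(set);
    }
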
@@ -1784,16 +1818,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
        if (!ctx)
                return ERR_PTR(-ENOMEM);
 
-       /*
-        * If a crashdump is active, then we are potentially in a very
-        * memory constrained environment. Limit us to 1 queue and
-        * 64 tags to prevent using too much memory.
-        */
-       if (is_kdump_kernel()) {
-               set->nr_hw_queues = 1;
-               set->queue_depth = min(64U, set->queue_depth);
-       }
-
        hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
                        set->numa_node);
 
@@ -2067,6 +2091,16 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
                set->queue_depth = BLK_MQ_MAX_DEPTH;
        }
 
+       /*
+        * If a crashdump is active, then we are potentially in a very
+        * memory constrained environment. Limit us to 1 queue and
+        * 64 tags to prevent using too much memory.
+        */
+       if (is_kdump_kernel()) {
+               set->nr_hw_queues = 1;
+               set->queue_depth = min(64U, set->queue_depth);
+       }
+
        set->tags = kmalloc_node(set->nr_hw_queues *
                        sizeof(struct blk_mq_tags *),
                        GFP_KERNEL, set->numa_node);