
Merge tag 'for-4.18/block-20180603' of git://git.kernel.dk/linux-block

Pull block updates from Jens Axboe:

 - clean up how we pass around gfp_t and
   blk_mq_req_flags_t (Christoph)

 - prepare us to defer scheduler attach (Christoph)

 - clean up drivers handling of bounce buffers (Christoph)

 - fix timeout handling corner cases (Christoph/Bart/Keith)

 - bcache fixes (Coly)

 - prep work for bcachefs and some block layer optimizations (Kent).

 - convert users of bio_sets to using embedded structs (Kent).

 - fixes for the BFQ io scheduler (Paolo/Davide/Filippo)

 - lightnvm fixes and improvements (Matias, with contributions from Hans
   and Javier)

 - adding discard throttling to blk-wbt (me)

 - sbitmap and blk-mq-tag handling improvements (me/Omar/Ming).

 - remove the sparc jsflash block driver, acked by DaveM.

 - Kyber scheduler improvement from Jianchao, making it more friendly
   wrt merging.

 - conversion of symbolic proc permissions to octal, from Joe Perches.
   Previously the block parts were a mix of both.

 - nbd fixes (Josef and Kevin Vigor)

 - unify how we handle the various kinds of timestamps that the block
   core and utility code use (Omar)

 - three NVMe pull requests from Keith and Christoph, bringing AEN to
   feature completeness, file backed namespaces, cq/sq lock split, and
   various fixes

 - various little fixes and improvements all over the map

* tag 'for-4.18/block-20180603' of git://git.kernel.dk/linux-block: (196 commits)
  blk-mq: update nr_requests when switching to 'none' scheduler
  block: don't use blocking queue entered for recursive bio submits
  dm-crypt: fix warning in shutdown path
  lightnvm: pblk: take bitmap alloc. out of critical section
  lightnvm: pblk: kick writer on new flush points
  lightnvm: pblk: only try to recover lines with written smeta
  lightnvm: pblk: remove unnecessary bio_get/put
  lightnvm: pblk: add possibility to set write buffer size manually
  lightnvm: fix partial read error path
  lightnvm: proper error handling for pblk_bio_add_pages
  lightnvm: pblk: fix smeta write error path
  lightnvm: pblk: garbage collect lines with failed writes
  lightnvm: pblk: rework write error recovery path
  lightnvm: pblk: remove dead function
  lightnvm: pass flag on graceful teardown to targets
  lightnvm: pblk: check for chunk size before allocating it
  lightnvm: pblk: remove unnecessary argument
  lightnvm: pblk: remove unnecessary indirection
  lightnvm: pblk: return NVM_ error on failed submission
  lightnvm: pblk: warn in case of corrupted write buffer
  ...
Linus Torvalds, 7 years ago
commit f459c34538
100 changed files with 2192 additions and 2143 deletions
  1. Documentation/block/null_blk.txt (+6 -3)
  2. Documentation/scsi/scsi_eh.txt (+2 -13)
  3. MAINTAINERS (+1 -1)
  4. arch/sparc/include/uapi/asm/jsflash.h (+0 -40)
  5. block/bfq-cgroup.c (+20 -20)
  6. block/bfq-iosched.c (+237 -241)
  7. block/bfq-iosched.h (+9 -21)
  8. block/bio-integrity.c (+14 -15)
  9. block/bio.c (+109 -80)
  10. block/blk-core.c (+52 -68)
  11. block/blk-integrity.c (+6 -6)
  12. block/blk-lib.c (+9 -3)
  13. block/blk-merge.c (+19 -10)
  14. block/blk-mq-debugfs.c (+0 -1)
  15. block/blk-mq-sched.c (+25 -21)
  16. block/blk-mq-sched.h (+0 -2)
  17. block/blk-mq-sysfs.c (+3 -3)
  18. block/blk-mq-tag.c (+13 -1)
  19. block/blk-mq.c (+108 -232)
  20. block/blk-mq.h (+2 -40)
  21. block/blk-stat.c (+3 -7)
  22. block/blk-stat.h (+1 -44)
  23. block/blk-sysfs.c (+45 -35)
  24. block/blk-throttle.c (+16 -19)
  25. block/blk-timeout.c (+1 -5)
  26. block/blk-wbt.c (+80 -49)
  27. block/blk-wbt.h (+21 -34)
  28. block/blk-zoned.c (+6 -2)
  29. block/blk.h (+4 -1)
  30. block/bounce.c (+26 -26)
  31. block/bsg-lib.c (+2 -4)
  32. block/bsg.c (+7 -37)
  33. block/cfq-iosched.c (+30 -36)
  34. block/deadline-iosched.c (+1 -2)
  35. block/elevator.c (+52 -49)
  36. block/genhd.c (+17 -20)
  37. block/kyber-iosched.c (+164 -35)
  38. block/mq-deadline.c (+1 -2)
  39. block/partition-generic.c (+12 -14)
  40. block/scsi_ioctl.c (+4 -6)
  41. drivers/ata/libata-eh.c (+0 -51)
  42. drivers/block/DAC960.c (+3 -8)
  43. drivers/block/DAC960.h (+0 -1)
  44. drivers/block/aoe/aoeblk.c (+5 -6)
  45. drivers/block/aoe/aoecmd.c (+2 -1)
  46. drivers/block/brd.c (+7 -3)
  47. drivers/block/drbd/drbd_bitmap.c (+3 -2)
  48. drivers/block/drbd/drbd_debugfs.c (+10 -10)
  49. drivers/block/drbd/drbd_int.h (+5 -5)
  50. drivers/block/drbd/drbd_main.c (+26 -47)
  51. drivers/block/drbd/drbd_receiver.c (+3 -3)
  52. drivers/block/drbd/drbd_req.c (+2 -2)
  53. drivers/block/drbd/drbd_req.h (+1 -1)
  54. drivers/block/floppy.c (+1 -1)
  55. drivers/block/loop.c (+9 -8)
  56. drivers/block/loop.h (+1 -0)
  57. drivers/block/mtip32xx/mtip32xx.c (+10 -19)
  58. drivers/block/nbd.c (+52 -25)
  59. drivers/block/null_blk.c (+19 -17)
  60. drivers/block/paride/pd.c (+1 -1)
  61. drivers/block/pktcdvd.c (+30 -30)
  62. drivers/block/ps3disk.c (+0 -2)
  63. drivers/block/rbd.c (+22 -22)
  64. drivers/block/rsxx/core.c (+3 -3)
  65. drivers/block/sx8.c (+1 -1)
  66. drivers/block/virtio_blk.c (+4 -4)
  67. drivers/block/xen-blkback/blkback.c (+1 -1)
  68. drivers/block/xen-blkback/xenbus.c (+2 -2)
  69. drivers/block/xen-blkfront.c (+3 -4)
  70. drivers/cdrom/cdrom.c (+1 -1)
  71. drivers/ide/ide-atapi.c (+1 -1)
  72. drivers/ide/ide-cd.c (+1 -1)
  73. drivers/ide/ide-cd_ioctl.c (+1 -1)
  74. drivers/ide/ide-devsets.c (+1 -1)
  75. drivers/ide/ide-disk.c (+1 -1)
  76. drivers/ide/ide-ioctls.c (+2 -2)
  77. drivers/ide/ide-park.c (+2 -2)
  78. drivers/ide/ide-pm.c (+2 -3)
  79. drivers/ide/ide-tape.c (+2 -2)
  80. drivers/ide/ide-taskfile.c (+2 -2)
  81. drivers/lightnvm/core.c (+5 -5)
  82. drivers/lightnvm/pblk-cache.c (+6 -4)
  83. drivers/lightnvm/pblk-core.c (+154 -79)
  84. drivers/lightnvm/pblk-gc.c (+71 -41)
  85. drivers/lightnvm/pblk-init.c (+105 -67)
  86. drivers/lightnvm/pblk-map.c (+24 -9)
  87. drivers/lightnvm/pblk-rb.c (+4 -44)
  88. drivers/lightnvm/pblk-read.c (+88 -58)
  89. drivers/lightnvm/pblk-recovery.c (+21 -100)
  90. drivers/lightnvm/pblk-rl.c (+25 -4)
  91. drivers/lightnvm/pblk-sysfs.c (+13 -2)
  92. drivers/lightnvm/pblk-write.c (+178 -91)
  93. drivers/lightnvm/pblk.h (+36 -22)
  94. drivers/md/bcache/bcache.h (+7 -7)
  95. drivers/md/bcache/bset.c (+4 -9)
  96. drivers/md/bcache/bset.h (+1 -1)
  97. drivers/md/bcache/btree.c (+2 -2)
  98. drivers/md/bcache/io.c (+2 -2)
  99. drivers/md/bcache/request.c (+9 -9)
  100. drivers/md/bcache/super.c (+67 -42)

+ 6 - 3
Documentation/block/null_blk.txt

@@ -71,13 +71,16 @@ use_per_node_hctx=[0/1]: Default: 0
   1: The multi-queue block layer is instantiated with a hardware dispatch
      queue for each CPU node in the system.
 
-use_lightnvm=[0/1]: Default: 0
-  Register device with LightNVM. Requires blk-mq and CONFIG_NVM to be enabled.
-
 no_sched=[0/1]: Default: 0
   0: nullb* use default blk-mq io scheduler.
   1: nullb* doesn't use io scheduler.
 
+blocking=[0/1]: Default: 0
+  0: Register as a non-blocking blk-mq driver device.
+  1: Register as a blocking blk-mq driver device, null_blk will set
+     the BLK_MQ_F_BLOCKING flag, indicating that it sometimes/always
+     needs to block in its ->queue_rq() function.
+
 shared_tags=[0/1]: Default: 0
   0: Tag set is not shared.
   1: Tag set shared between devices for blk-mq. Only makes sense with

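For context, the new blocking=1 option corresponds to a driver setting BLK_MQ_F_BLOCKING on its tag set. A minimal sketch of how a blk-mq driver would opt in (my_tag_set, my_mq_ops and my_driver_setup_tags are placeholder names, not code from this series); null_blk does essentially the same thing, turning the module parameter into this flag before allocating its tag set:

#include <linux/blk-mq.h>

/* Placeholder ops table; a real driver must at least provide ->queue_rq(). */
static const struct blk_mq_ops my_mq_ops;

static struct blk_mq_tag_set my_tag_set;

static int my_driver_setup_tags(void)
{
	my_tag_set.ops		= &my_mq_ops;
	my_tag_set.nr_hw_queues	= 1;
	my_tag_set.queue_depth	= 64;
	my_tag_set.numa_node	= NUMA_NO_NODE;
	/* BLK_MQ_F_BLOCKING tells blk-mq that ->queue_rq() may sleep */
	my_tag_set.flags	= BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;

	return blk_mq_alloc_tag_set(&my_tag_set);
}
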
+ 2 - 13
Documentation/scsi/scsi_eh.txt

@@ -82,24 +82,13 @@ function
  1. invokes optional hostt->eh_timed_out() callback.  Return value can
     be one of
 
-    - BLK_EH_HANDLED
-	This indicates that eh_timed_out() dealt with the timeout.
-	The command is passed back to the block layer and completed
-	via __blk_complete_requests().
-
-	*NOTE* After returning BLK_EH_HANDLED the SCSI layer is
-	assumed to be finished with the command, and no other
-	functions from the SCSI layer will be called. So this
-	should typically only be returned if the eh_timed_out()
-	handler raced with normal completion.
-
     - BLK_EH_RESET_TIMER
 	This indicates that more time is required to finish the
 	command.  Timer is restarted.  This action is counted as a
 	retry and only allowed scmd->allowed + 1(!) times.  Once the
-	limit is reached, action for BLK_EH_NOT_HANDLED is taken instead.
+	limit is reached, action for BLK_EH_DONE is taken instead.
 
-    - BLK_EH_NOT_HANDLED
+    - BLK_EH_DONE
         eh_timed_out() callback did not handle the command.
 	Step #2 is taken.
 

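With BLK_EH_HANDLED gone from the documented return values, a host template's eh_timed_out() only chooses between the two remaining codes. A minimal sketch, assuming a hypothetical my_device_busy() helper (not part of the SCSI API):

#include <linux/blkdev.h>
#include <scsi/scsi_cmnd.h>
#include <scsi/scsi_device.h>

/* Hypothetical hostt->eh_timed_out() callback: ask the block layer for
 * more time while the device is still making progress, otherwise fall
 * through to step #2 of the error-handling sequence. */
static enum blk_eh_timer_return my_eh_timed_out(struct scsi_cmnd *scmd)
{
	if (my_device_busy(scmd->device))	/* assumed helper */
		return BLK_EH_RESET_TIMER;

	return BLK_EH_DONE;
}
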
+ 1 - 1
MAINTAINERS

@@ -9700,7 +9700,7 @@ S:	Maintained
 F:	drivers/net/ethernet/netronome/
 
 NETWORK BLOCK DEVICE (NBD)
-M:	Josef Bacik <jbacik@fb.com>
+M:	Josef Bacik <josef@toxicpanda.com>
 S:	Maintained
 L:	linux-block@vger.kernel.org
 L:	nbd@other.debian.org

+ 0 - 40
arch/sparc/include/uapi/asm/jsflash.h

@@ -1,40 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * jsflash.h: OS Flash SIMM support for JavaStations.
- *
- * Copyright (C) 1999  Pete Zaitcev
- */
-
-#ifndef _SPARC_JSFLASH_H
-#define _SPARC_JSFLASH_H
-
-#ifndef _SPARC_TYPES_H
-#include <linux/types.h>
-#endif
-
-/*
- * Semantics of the offset is a full address.
- * Hardcode it or get it from probe ioctl.
- *
- * We use full bus address, so that we would be
- * automatically compatible with possible future systems.
- */
-
-#define JSFLASH_IDENT   (('F'<<8)|54)
-struct jsflash_ident_arg {
-	__u64 off;                /* 0x20000000 is included */
-	__u32 size;
-	char name[32];		/* With trailing zero */
-};
-
-#define JSFLASH_ERASE   (('F'<<8)|55)
-/* Put 0 as argument, may be flags or sector number... */
-
-#define JSFLASH_PROGRAM (('F'<<8)|56)
-struct jsflash_program_arg {
-	__u64 data;		/* char* for sparc and sparc64 */
-	__u64 off;
-	__u32 size;
-};
-
-#endif /* _SPARC_JSFLASH_H */

+ 20 - 20
block/bfq-cgroup.c

@@ -55,13 +55,13 @@ BFQG_FLAG_FNS(empty)
 /* This should be called with the scheduler lock held. */
 static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
 {
-	unsigned long long now;
+	u64 now;
 
 	if (!bfqg_stats_waiting(stats))
 		return;
 
-	now = sched_clock();
-	if (time_after64(now, stats->start_group_wait_time))
+	now = ktime_get_ns();
+	if (now > stats->start_group_wait_time)
 		blkg_stat_add(&stats->group_wait_time,
 			      now - stats->start_group_wait_time);
 	bfqg_stats_clear_waiting(stats);
@@ -77,20 +77,20 @@ static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
 		return;
 	if (bfqg == curr_bfqg)
 		return;
-	stats->start_group_wait_time = sched_clock();
+	stats->start_group_wait_time = ktime_get_ns();
 	bfqg_stats_mark_waiting(stats);
 }
 
 /* This should be called with the scheduler lock held. */
 static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
 {
-	unsigned long long now;
+	u64 now;
 
 	if (!bfqg_stats_empty(stats))
 		return;
 
-	now = sched_clock();
-	if (time_after64(now, stats->start_empty_time))
+	now = ktime_get_ns();
+	if (now > stats->start_empty_time)
 		blkg_stat_add(&stats->empty_time,
 			      now - stats->start_empty_time);
 	bfqg_stats_clear_empty(stats);
@@ -116,7 +116,7 @@ void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
 	if (bfqg_stats_empty(stats))
 		return;
 
-	stats->start_empty_time = sched_clock();
+	stats->start_empty_time = ktime_get_ns();
 	bfqg_stats_mark_empty(stats);
 }
 
@@ -125,9 +125,9 @@ void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
 	struct bfqg_stats *stats = &bfqg->stats;
 
 	if (bfqg_stats_idling(stats)) {
-		unsigned long long now = sched_clock();
+		u64 now = ktime_get_ns();
 
-		if (time_after64(now, stats->start_idle_time))
+		if (now > stats->start_idle_time)
 			blkg_stat_add(&stats->idle_time,
 				      now - stats->start_idle_time);
 		bfqg_stats_clear_idling(stats);
@@ -138,7 +138,7 @@ void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
 {
 	struct bfqg_stats *stats = &bfqg->stats;
 
-	stats->start_idle_time = sched_clock();
+	stats->start_idle_time = ktime_get_ns();
 	bfqg_stats_mark_idling(stats);
 }
 
@@ -171,18 +171,18 @@ void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op)
 	blkg_rwstat_add(&bfqg->stats.merged, op, 1);
 }
 
-void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time,
-				  uint64_t io_start_time, unsigned int op)
+void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
+				  u64 io_start_time_ns, unsigned int op)
 {
 	struct bfqg_stats *stats = &bfqg->stats;
-	unsigned long long now = sched_clock();
+	u64 now = ktime_get_ns();
 
-	if (time_after64(now, io_start_time))
+	if (now > io_start_time_ns)
 		blkg_rwstat_add(&stats->service_time, op,
-				now - io_start_time);
-	if (time_after64(io_start_time, start_time))
+				now - io_start_time_ns);
+	if (io_start_time_ns > start_time_ns)
 		blkg_rwstat_add(&stats->wait_time, op,
-				io_start_time - start_time);
+				io_start_time_ns - start_time_ns);
 }
 
 #else /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
@@ -191,8 +191,8 @@ void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
 			      unsigned int op) { }
 void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { }
 void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { }
-void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time,
-				  uint64_t io_start_time, unsigned int op) { }
+void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
+				  u64 io_start_time_ns, unsigned int op) { }
 void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
 void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
 void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }

+ 237 - 241
block/bfq-iosched.c

@@ -49,9 +49,39 @@
  *
  * In particular, to provide these low-latency guarantees, BFQ
  * explicitly privileges the I/O of two classes of time-sensitive
- * applications: interactive and soft real-time. This feature enables
- * BFQ to provide applications in these classes with a very low
- * latency. Finally, BFQ also features additional heuristics for
+ * applications: interactive and soft real-time. In more detail, BFQ
+ * behaves this way if the low_latency parameter is set (default
+ * configuration). This feature enables BFQ to provide applications in
+ * these classes with a very low latency.
+ *
+ * To implement this feature, BFQ constantly tries to detect whether
+ * the I/O requests in a bfq_queue come from an interactive or a soft
+ * real-time application. For brevity, in these cases, the queue is
+ * said to be interactive or soft real-time. In both cases, BFQ
+ * privileges the service of the queue, over that of non-interactive
+ * and non-soft-real-time queues. This privileging is performed,
+ * mainly, by raising the weight of the queue. So, for brevity, we
+ * call just weight-raising periods the time periods during which a
+ * queue is privileged, because deemed interactive or soft real-time.
+ *
+ * The detection of soft real-time queues/applications is described in
+ * detail in the comments on the function
+ * bfq_bfqq_softrt_next_start. On the other hand, the detection of an
+ * interactive queue works as follows: a queue is deemed interactive
+ * if it is constantly non empty only for a limited time interval,
+ * after which it does become empty. The queue may be deemed
+ * interactive again (for a limited time), if it restarts being
+ * constantly non empty, provided that this happens only after the
+ * queue has remained empty for a given minimum idle time.
+ *
+ * By default, BFQ computes automatically the above maximum time
+ * interval, i.e., the time interval after which a constantly
+ * non-empty queue stops being deemed interactive. Since a queue is
+ * weight-raised while it is deemed interactive, this maximum time
+ * interval happens to coincide with the (maximum) duration of the
+ * weight-raising for interactive queues.
+ *
+ * Finally, BFQ also features additional heuristics for
  * preserving both a low latency and a high throughput on NCQ-capable,
  * rotational or flash-based devices, and to get the job done quickly
  * for applications consisting in many I/O-bound processes.
@@ -61,14 +91,14 @@
  * all low-latency heuristics for that device, by setting low_latency
  * to 0.
  *
- * BFQ is described in [1], where also a reference to the initial, more
- * theoretical paper on BFQ can be found. The interested reader can find
- * in the latter paper full details on the main algorithm, as well as
- * formulas of the guarantees and formal proofs of all the properties.
- * With respect to the version of BFQ presented in these papers, this
- * implementation adds a few more heuristics, such as the one that
- * guarantees a low latency to soft real-time applications, and a
- * hierarchical extension based on H-WF2Q+.
+ * BFQ is described in [1], where also a reference to the initial,
+ * more theoretical paper on BFQ can be found. The interested reader
+ * can find in the latter paper full details on the main algorithm, as
+ * well as formulas of the guarantees and formal proofs of all the
+ * properties.  With respect to the version of BFQ presented in these
+ * papers, this implementation adds a few more heuristics, such as the
+ * ones that guarantee a low latency to interactive and soft real-time
+ * applications, and a hierarchical extension based on H-WF2Q+.
  *
  * B-WF2Q+ is based on WF2Q+, which is described in [2], together with
  * H-WF2Q+, while the augmented tree used here to implement B-WF2Q+
@@ -218,56 +248,46 @@ static struct kmem_cache *bfq_pool;
 #define BFQ_RATE_SHIFT		16
 
 /*
- * By default, BFQ computes the duration of the weight raising for
- * interactive applications automatically, using the following formula:
- * duration = (R / r) * T, where r is the peak rate of the device, and
- * R and T are two reference parameters.
- * In particular, R is the peak rate of the reference device (see
- * below), and T is a reference time: given the systems that are
- * likely to be installed on the reference device according to its
- * speed class, T is about the maximum time needed, under BFQ and
- * while reading two files in parallel, to load typical large
- * applications on these systems (see the comments on
- * max_service_from_wr below, for more details on how T is obtained).
- * In practice, the slower/faster the device at hand is, the more/less
- * it takes to load applications with respect to the reference device.
- * Accordingly, the longer/shorter BFQ grants weight raising to
- * interactive applications.
- *
- * BFQ uses four different reference pairs (R, T), depending on:
- * . whether the device is rotational or non-rotational;
- * . whether the device is slow, such as old or portable HDDs, as well as
- *   SD cards, or fast, such as newer HDDs and SSDs.
+ * When configured for computing the duration of the weight-raising
+ * for interactive queues automatically (see the comments at the
+ * beginning of this file), BFQ does it using the following formula:
+ * duration = (ref_rate / r) * ref_wr_duration,
+ * where r is the peak rate of the device, and ref_rate and
+ * ref_wr_duration are two reference parameters.  In particular,
+ * ref_rate is the peak rate of the reference storage device (see
+ * below), and ref_wr_duration is about the maximum time needed, with
+ * BFQ and while reading two files in parallel, to load typical large
+ * applications on the reference device (see the comments on
+ * max_service_from_wr below, for more details on how ref_wr_duration
+ * is obtained).  In practice, the slower/faster the device at hand
+ * is, the more/less it takes to load applications with respect to the
+ * reference device.  Accordingly, the longer/shorter BFQ grants
+ * weight raising to interactive applications.
  *
- * The device's speed class is dynamically (re)detected in
- * bfq_update_peak_rate() every time the estimated peak rate is updated.
+ * BFQ uses two different reference pairs (ref_rate, ref_wr_duration),
+ * depending on whether the device is rotational or non-rotational.
  *
- * In the following definitions, R_slow[0]/R_fast[0] and
- * T_slow[0]/T_fast[0] are the reference values for a slow/fast
- * rotational device, whereas R_slow[1]/R_fast[1] and
- * T_slow[1]/T_fast[1] are the reference values for a slow/fast
- * non-rotational device. Finally, device_speed_thresh are the
- * thresholds used to switch between speed classes. The reference
- * rates are not the actual peak rates of the devices used as a
- * reference, but slightly lower values. The reason for using these
- * slightly lower values is that the peak-rate estimator tends to
- * yield slightly lower values than the actual peak rate (it can yield
- * the actual peak rate only if there is only one process doing I/O,
- * and the process does sequential I/O).
+ * In the following definitions, ref_rate[0] and ref_wr_duration[0]
+ * are the reference values for a rotational device, whereas
+ * ref_rate[1] and ref_wr_duration[1] are the reference values for a
+ * non-rotational device. The reference rates are not the actual peak
+ * rates of the devices used as a reference, but slightly lower
+ * values. The reason for using slightly lower values is that the
+ * peak-rate estimator tends to yield slightly lower values than the
+ * actual peak rate (it can yield the actual peak rate only if there
+ * is only one process doing I/O, and the process does sequential
+ * I/O).
  *
- * Both the reference peak rates and the thresholds are measured in
- * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
+ * The reference peak rates are measured in sectors/usec, left-shifted
+ * by BFQ_RATE_SHIFT.
  */
-static int R_slow[2] = {1000, 10700};
-static int R_fast[2] = {14000, 33000};
+static int ref_rate[2] = {14000, 33000};
 /*
- * To improve readability, a conversion function is used to initialize the
- * following arrays, which entails that they can be initialized only in a
- * function.
+ * To improve readability, a conversion function is used to initialize
+ * the following array, which entails that the array can be
+ * initialized only in a function.
  */
-static int T_slow[2];
-static int T_fast[2];
-static int device_speed_thresh[2];
+static int ref_wr_duration[2];
 
 /*
  * BFQ uses the above-detailed, time-based weight-raising mechanism to
@@ -486,46 +506,6 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd,
 	}
 }
 
-/*
- * See the comments on bfq_limit_depth for the purpose of
- * the depths set in the function.
- */
-static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
-{
-	bfqd->sb_shift = bt->sb.shift;
-
-	/*
-	 * In-word depths if no bfq_queue is being weight-raised:
-	 * leaving 25% of tags only for sync reads.
-	 *
-	 * In next formulas, right-shift the value
-	 * (1U<<bfqd->sb_shift), instead of computing directly
-	 * (1U<<(bfqd->sb_shift - something)), to be robust against
-	 * any possible value of bfqd->sb_shift, without having to
-	 * limit 'something'.
-	 */
-	/* no more than 50% of tags for async I/O */
-	bfqd->word_depths[0][0] = max((1U<<bfqd->sb_shift)>>1, 1U);
-	/*
-	 * no more than 75% of tags for sync writes (25% extra tags
-	 * w.r.t. async I/O, to prevent async I/O from starving sync
-	 * writes)
-	 */
-	bfqd->word_depths[0][1] = max(((1U<<bfqd->sb_shift) * 3)>>2, 1U);
-
-	/*
-	 * In-word depths in case some bfq_queue is being weight-
-	 * raised: leaving ~63% of tags for sync reads. This is the
-	 * highest percentage for which, in our tests, application
-	 * start-up times didn't suffer from any regression due to tag
-	 * shortage.
-	 */
-	/* no more than ~18% of tags for async I/O */
-	bfqd->word_depths[1][0] = max(((1U<<bfqd->sb_shift) * 3)>>4, 1U);
-	/* no more than ~37% of tags for sync writes (~20% extra tags) */
-	bfqd->word_depths[1][1] = max(((1U<<bfqd->sb_shift) * 6)>>4, 1U);
-}
-
 /*
  * Async I/O can easily starve sync I/O (both sync reads and sync
  * writes), by consuming all tags. Similarly, storms of sync writes,
@@ -535,25 +515,11 @@ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
  */
 static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
 {
-	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
 	struct bfq_data *bfqd = data->q->elevator->elevator_data;
-	struct sbitmap_queue *bt;
 
 	if (op_is_sync(op) && !op_is_write(op))
 		return;
 
-	if (data->flags & BLK_MQ_REQ_RESERVED) {
-		if (unlikely(!tags->nr_reserved_tags)) {
-			WARN_ON_ONCE(1);
-			return;
-		}
-		bt = &tags->breserved_tags;
-	} else
-		bt = &tags->bitmap_tags;
-
-	if (unlikely(bfqd->sb_shift != bt->sb.shift))
-		bfq_update_depths(bfqd, bt);
-
 	data->shallow_depth =
 		bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
 
@@ -906,26 +872,30 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
 	if (bfqd->bfq_wr_max_time > 0)
 		return bfqd->bfq_wr_max_time;
 
-	dur = bfqd->RT_prod;
+	dur = bfqd->rate_dur_prod;
 	do_div(dur, bfqd->peak_rate);
 
 	/*
-	 * Limit duration between 3 and 13 seconds. Tests show that
-	 * higher values than 13 seconds often yield the opposite of
-	 * the desired result, i.e., worsen responsiveness by letting
-	 * non-interactive and non-soft-real-time applications
-	 * preserve weight raising for a too long time interval.
+	 * Limit duration between 3 and 25 seconds. The upper limit
+	 * has been conservatively set after the following worst case:
+	 * on a QEMU/KVM virtual machine
+	 * - running in a slow PC
+	 * - with a virtual disk stacked on a slow low-end 5400rpm HDD
+	 * - serving a heavy I/O workload, such as the sequential reading
+	 *   of several files
+	 * mplayer took 23 seconds to start, if constantly weight-raised.
+	 *
+	 * As for higher values than that accomodating the above bad
+	 * scenario, tests show that higher values would often yield
+	 * the opposite of the desired result, i.e., would worsen
+	 * responsiveness by allowing non-interactive applications to
+	 * preserve weight raising for too long.
 	 *
 	 * On the other end, lower values than 3 seconds make it
 	 * difficult for most interactive tasks to complete their jobs
 	 * before weight-raising finishes.
 	 */
-	if (dur > msecs_to_jiffies(13000))
-		dur = msecs_to_jiffies(13000);
-	else if (dur < msecs_to_jiffies(3000))
-		dur = msecs_to_jiffies(3000);
-
-	return dur;
+	return clamp_val(dur, msecs_to_jiffies(3000), msecs_to_jiffies(25000));
 }
 
 /* switch back from soft real-time to interactive weight raising */
@@ -1392,15 +1362,6 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
 	return wr_or_deserves_wr;
 }
 
-/*
- * Return the farthest future time instant according to jiffies
- * macros.
- */
-static unsigned long bfq_greatest_from_now(void)
-{
-	return jiffies + MAX_JIFFY_OFFSET;
-}
-
 /*
  * Return the farthest past time instant according to jiffies
  * macros.
@@ -1545,7 +1506,8 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
 	in_burst = bfq_bfqq_in_large_burst(bfqq);
 	soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
 		!in_burst &&
-		time_is_before_jiffies(bfqq->soft_rt_next_start);
+		time_is_before_jiffies(bfqq->soft_rt_next_start) &&
+		bfqq->dispatched == 0;
 	*interactive = !in_burst && idle_for_long_time;
 	wr_or_deserves_wr = bfqd->low_latency &&
 		(bfqq->wr_coeff > 1 ||
@@ -1858,6 +1820,8 @@ static int bfq_request_merge(struct request_queue *q, struct request **req,
 	return ELEVATOR_NO_MERGE;
 }
 
+static struct bfq_queue *bfq_init_rq(struct request *rq);
+
 static void bfq_request_merged(struct request_queue *q, struct request *req,
 			       enum elv_merge type)
 {
@@ -1866,7 +1830,7 @@ static void bfq_request_merged(struct request_queue *q, struct request *req,
 	    blk_rq_pos(req) <
 	    blk_rq_pos(container_of(rb_prev(&req->rb_node),
 				    struct request, rb_node))) {
-		struct bfq_queue *bfqq = RQ_BFQQ(req);
+		struct bfq_queue *bfqq = bfq_init_rq(req);
 		struct bfq_data *bfqd = bfqq->bfqd;
 		struct request *prev, *next_rq;
 
@@ -1891,14 +1855,25 @@ static void bfq_request_merged(struct request_queue *q, struct request *req,
 	}
 }
 
+/*
+ * This function is called to notify the scheduler that the requests
+ * rq and 'next' have been merged, with 'next' going away.  BFQ
+ * exploits this hook to address the following issue: if 'next' has a
+ * fifo_time lower that rq, then the fifo_time of rq must be set to
+ * the value of 'next', to not forget the greater age of 'next'.
+ *
+ * NOTE: in this function we assume that rq is in a bfq_queue, basing
+ * on that rq is picked from the hash table q->elevator->hash, which,
+ * in its turn, is filled only with I/O requests present in
+ * bfq_queues, while BFQ is in use for the request queue q. In fact,
+ * the function that fills this hash table (elv_rqhash_add) is called
+ * only by bfq_insert_request.
+ */
 static void bfq_requests_merged(struct request_queue *q, struct request *rq,
 				struct request *next)
 {
-	struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);
-
-	if (!RB_EMPTY_NODE(&rq->rb_node))
-		goto end;
-	spin_lock_irq(&bfqq->bfqd->lock);
+	struct bfq_queue *bfqq = bfq_init_rq(rq),
+		*next_bfqq = bfq_init_rq(next);
 
 	/*
 	 * If next and rq belong to the same bfq_queue and next is older
@@ -1920,11 +1895,6 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq,
 	if (bfqq->next_rq == next)
 		bfqq->next_rq = rq;
 
-	bfq_remove_request(q, next);
-	bfqg_stats_update_io_remove(bfqq_group(bfqq), next->cmd_flags);
-
-	spin_unlock_irq(&bfqq->bfqd->lock);
-end:
 	bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);
 }
 
@@ -2506,37 +2476,15 @@ static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
 /*
  * Update parameters related to throughput and responsiveness, as a
  * function of the estimated peak rate. See comments on
- * bfq_calc_max_budget(), and on T_slow and T_fast arrays.
+ * bfq_calc_max_budget(), and on the ref_wr_duration array.
  */
 static void update_thr_responsiveness_params(struct bfq_data *bfqd)
 {
-	int dev_type = blk_queue_nonrot(bfqd->queue);
-
-	if (bfqd->bfq_user_max_budget == 0)
+	if (bfqd->bfq_user_max_budget == 0) {
 		bfqd->bfq_max_budget =
 			bfq_calc_max_budget(bfqd);
-
-	if (bfqd->device_speed == BFQ_BFQD_FAST &&
-	    bfqd->peak_rate < device_speed_thresh[dev_type]) {
-		bfqd->device_speed = BFQ_BFQD_SLOW;
-		bfqd->RT_prod = R_slow[dev_type] *
-			T_slow[dev_type];
-	} else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
-		   bfqd->peak_rate > device_speed_thresh[dev_type]) {
-		bfqd->device_speed = BFQ_BFQD_FAST;
-		bfqd->RT_prod = R_fast[dev_type] *
-			T_fast[dev_type];
+		bfq_log(bfqd, "new max_budget = %d", bfqd->bfq_max_budget);
 	}
-
-	bfq_log(bfqd,
-"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu setcs/sec",
-		dev_type == 0 ? "ROT" : "NONROT",
-		bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW",
-		bfqd->device_speed == BFQ_BFQD_FAST ?
-		(USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT :
-		(USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT,
-		(USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>>
-		BFQ_RATE_SHIFT);
 }
 
 static void bfq_reset_rate_computation(struct bfq_data *bfqd,
@@ -3265,23 +3213,6 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
 			bfqq->soft_rt_next_start =
 				bfq_bfqq_softrt_next_start(bfqd, bfqq);
 		else {
-			/*
-			 * The application is still waiting for the
-			 * completion of one or more requests:
-			 * prevent it from possibly being incorrectly
-			 * deemed as soft real-time by setting its
-			 * soft_rt_next_start to infinity. In fact,
-			 * without this assignment, the application
-			 * would be incorrectly deemed as soft
-			 * real-time if:
-			 * 1) it issued a new request before the
-			 *    completion of all its in-flight
-			 *    requests, and
-			 * 2) at that time, its soft_rt_next_start
-			 *    happened to be in the past.
-			 */
-			bfqq->soft_rt_next_start =
-				bfq_greatest_from_now();
 			/*
 			 * Schedule an update of soft_rt_next_start to when
 			 * the task may be discovered to be isochronous.
@@ -4540,14 +4471,12 @@ static inline void bfq_update_insert_stats(struct request_queue *q,
 					   unsigned int cmd_flags) {}
 #endif
 
-static void bfq_prepare_request(struct request *rq, struct bio *bio);
-
 static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 			       bool at_head)
 {
 	struct request_queue *q = hctx->queue;
 	struct bfq_data *bfqd = q->elevator->elevator_data;
-	struct bfq_queue *bfqq = RQ_BFQQ(rq);
+	struct bfq_queue *bfqq;
 	bool idle_timer_disabled = false;
 	unsigned int cmd_flags;
 
@@ -4562,24 +4491,13 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 	blk_mq_sched_request_inserted(rq);
 
 	spin_lock_irq(&bfqd->lock);
+	bfqq = bfq_init_rq(rq);
 	if (at_head || blk_rq_is_passthrough(rq)) {
 		if (at_head)
 			list_add(&rq->queuelist, &bfqd->dispatch);
 		else
 			list_add_tail(&rq->queuelist, &bfqd->dispatch);
-	} else {
-		if (WARN_ON_ONCE(!bfqq)) {
-			/*
-			 * This should never happen. Most likely rq is
-			 * a requeued regular request, being
-			 * re-inserted without being first
-			 * re-prepared. Do a prepare, to avoid
-			 * failure.
-			 */
-			bfq_prepare_request(rq, rq->bio);
-			bfqq = RQ_BFQQ(rq);
-		}
-
+	} else { /* bfqq is assumed to be non null here */
 		idle_timer_disabled = __bfq_insert_request(bfqd, rq);
 		/*
 		 * Update bfqq, because, if a queue merge has occurred
@@ -4778,8 +4696,8 @@ static void bfq_finish_requeue_request(struct request *rq)
 
 	if (rq->rq_flags & RQF_STARTED)
 		bfqg_stats_update_completion(bfqq_group(bfqq),
-					     rq_start_time_ns(rq),
-					     rq_io_start_time_ns(rq),
+					     rq->start_time_ns,
+					     rq->io_start_time_ns,
 					     rq->cmd_flags);
 
 	if (likely(rq->rq_flags & RQF_STARTED)) {
@@ -4922,11 +4840,48 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
 }
 
 /*
- * Allocate bfq data structures associated with this request.
+ * Only reset private fields. The actual request preparation will be
+ * performed by bfq_init_rq, when rq is either inserted or merged. See
+ * comments on bfq_init_rq for the reason behind this delayed
+ * preparation.
  */
 static void bfq_prepare_request(struct request *rq, struct bio *bio)
+{
+	/*
+	 * Regardless of whether we have an icq attached, we have to
+	 * clear the scheduler pointers, as they might point to
+	 * previously allocated bic/bfqq structs.
+	 */
+	rq->elv.priv[0] = rq->elv.priv[1] = NULL;
+}
+
+/*
+ * If needed, init rq, allocate bfq data structures associated with
+ * rq, and increment reference counters in the destination bfq_queue
+ * for rq. Return the destination bfq_queue for rq, or NULL is rq is
+ * not associated with any bfq_queue.
+ *
+ * This function is invoked by the functions that perform rq insertion
+ * or merging. One may have expected the above preparation operations
+ * to be performed in bfq_prepare_request, and not delayed to when rq
+ * is inserted or merged. The rationale behind this delayed
+ * preparation is that, after the prepare_request hook is invoked for
+ * rq, rq may still be transformed into a request with no icq, i.e., a
+ * request not associated with any queue. No bfq hook is invoked to
+ * signal this tranformation. As a consequence, should these
+ * preparation operations be performed when the prepare_request hook
+ * is invoked, and should rq be transformed one moment later, bfq
+ * would end up in an inconsistent state, because it would have
+ * incremented some queue counters for an rq destined to
+ * transformation, without any chance to correctly lower these
+ * counters back. In contrast, no transformation can still happen for
+ * rq after rq has been inserted or merged. So, it is safe to execute
+ * these preparation operations when rq is finally inserted or merged.
+ */
+static struct bfq_queue *bfq_init_rq(struct request *rq)
 {
 	struct request_queue *q = rq->q;
+	struct bio *bio = rq->bio;
 	struct bfq_data *bfqd = q->elevator->elevator_data;
 	struct bfq_io_cq *bic;
 	const int is_sync = rq_is_sync(rq);
@@ -4934,20 +4889,21 @@ static void bfq_prepare_request(struct request *rq, struct bio *bio)
 	bool new_queue = false;
 	bool bfqq_already_existing = false, split = false;
 
+	if (unlikely(!rq->elv.icq))
+		return NULL;
+
 	/*
-	 * Even if we don't have an icq attached, we should still clear
-	 * the scheduler pointers, as they might point to previously
-	 * allocated bic/bfqq structs.
+	 * Assuming that elv.priv[1] is set only if everything is set
+	 * for this rq. This holds true, because this function is
+	 * invoked only for insertion or merging, and, after such
+	 * events, a request cannot be manipulated any longer before
+	 * being removed from bfq.
 	 */
-	if (!rq->elv.icq) {
-		rq->elv.priv[0] = rq->elv.priv[1] = NULL;
-		return;
-	}
+	if (rq->elv.priv[1])
+		return rq->elv.priv[1];
 
 	bic = icq_to_bic(rq->elv.icq);
 
-	spin_lock_irq(&bfqd->lock);
-
 	bfq_check_ioprio_change(bic, bio);
 
 	bfq_bic_update_cgroup(bic, bio);
@@ -5006,7 +4962,7 @@ static void bfq_prepare_request(struct request *rq, struct bio *bio)
 	if (unlikely(bfq_bfqq_just_created(bfqq)))
 		bfq_handle_burst(bfqd, bfqq);
 
-	spin_unlock_irq(&bfqd->lock);
+	return bfqq;
 }
 
 static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq)
@@ -5105,6 +5061,64 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
 	__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
 }
 
+/*
+ * See the comments on bfq_limit_depth for the purpose of
+ * the depths set in the function. Return minimum shallow depth we'll use.
+ */
+static unsigned int bfq_update_depths(struct bfq_data *bfqd,
+				      struct sbitmap_queue *bt)
+{
+	unsigned int i, j, min_shallow = UINT_MAX;
+
+	/*
+	 * In-word depths if no bfq_queue is being weight-raised:
+	 * leaving 25% of tags only for sync reads.
+	 *
+	 * In next formulas, right-shift the value
+	 * (1U<<bt->sb.shift), instead of computing directly
+	 * (1U<<(bt->sb.shift - something)), to be robust against
+	 * any possible value of bt->sb.shift, without having to
+	 * limit 'something'.
+	 */
+	/* no more than 50% of tags for async I/O */
+	bfqd->word_depths[0][0] = max((1U << bt->sb.shift) >> 1, 1U);
+	/*
+	 * no more than 75% of tags for sync writes (25% extra tags
+	 * w.r.t. async I/O, to prevent async I/O from starving sync
+	 * writes)
+	 */
+	bfqd->word_depths[0][1] = max(((1U << bt->sb.shift) * 3) >> 2, 1U);
+
+	/*
+	 * In-word depths in case some bfq_queue is being weight-
+	 * raised: leaving ~63% of tags for sync reads. This is the
+	 * highest percentage for which, in our tests, application
+	 * start-up times didn't suffer from any regression due to tag
+	 * shortage.
+	 */
+	/* no more than ~18% of tags for async I/O */
+	bfqd->word_depths[1][0] = max(((1U << bt->sb.shift) * 3) >> 4, 1U);
+	/* no more than ~37% of tags for sync writes (~20% extra tags) */
+	bfqd->word_depths[1][1] = max(((1U << bt->sb.shift) * 6) >> 4, 1U);
+
+	for (i = 0; i < 2; i++)
+		for (j = 0; j < 2; j++)
+			min_shallow = min(min_shallow, bfqd->word_depths[i][j]);
+
+	return min_shallow;
+}
+
+static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index)
+{
+	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
+	struct blk_mq_tags *tags = hctx->sched_tags;
+	unsigned int min_shallow;
+
+	min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags);
+	sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow);
+	return 0;
+}
+
 static void bfq_exit_queue(struct elevator_queue *e)
 {
 	struct bfq_data *bfqd = e->elevator_data;
@@ -5242,14 +5256,12 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 	bfqd->wr_busy_queues = 0;
 
 	/*
-	 * Begin by assuming, optimistically, that the device is a
-	 * high-speed one, and that its peak rate is equal to 2/3 of
-	 * the highest reference rate.
+	 * Begin by assuming, optimistically, that the device peak
+	 * rate is equal to 2/3 of the highest reference rate.
 	 */
-	bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *
-			T_fast[blk_queue_nonrot(bfqd->queue)];
-	bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
-	bfqd->device_speed = BFQ_BFQD_FAST;
+	bfqd->rate_dur_prod = ref_rate[blk_queue_nonrot(bfqd->queue)] *
+		ref_wr_duration[blk_queue_nonrot(bfqd->queue)];
+	bfqd->peak_rate = ref_rate[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
 
 	spin_lock_init(&bfqd->lock);
 
@@ -5526,6 +5538,7 @@ static struct elevator_type iosched_bfq_mq = {
 		.requests_merged	= bfq_requests_merged,
 		.request_merged		= bfq_request_merged,
 		.has_work		= bfq_has_work,
+		.init_hctx		= bfq_init_hctx,
 		.init_sched		= bfq_init_queue,
 		.exit_sched		= bfq_exit_queue,
 	},
@@ -5556,8 +5569,8 @@ static int __init bfq_init(void)
 	/*
 	 * Times to load large popular applications for the typical
 	 * systems installed on the reference devices (see the
-	 * comments before the definitions of the next two
-	 * arrays). Actually, we use slightly slower values, as the
+	 * comments before the definition of the next
+	 * array). Actually, we use slightly lower values, as the
 	 * estimated peak rate tends to be smaller than the actual
 	 * peak rate.  The reason for this last fact is that estimates
 	 * are computed over much shorter time intervals than the long
@@ -5566,25 +5579,8 @@ static int __init bfq_init(void)
 	 * scheduler cannot rely on a peak-rate-evaluation workload to
 	 * be run for a long time.
 	 */
-	T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */
-	T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */
-	T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */
-	T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */
-
-	/*
-	 * Thresholds that determine the switch between speed classes
-	 * (see the comments before the definition of the array
-	 * device_speed_thresh). These thresholds are biased towards
-	 * transitions to the fast class. This is safer than the
-	 * opposite bias. In fact, a wrong transition to the slow
-	 * class results in short weight-raising periods, because the
-	 * speed of the device then tends to be higher that the
-	 * reference peak rate. On the opposite end, a wrong
-	 * transition to the fast class tends to increase
-	 * weight-raising periods, because of the opposite reason.
-	 */
-	device_speed_thresh[0] = (4 * R_slow[0]) / 3;
-	device_speed_thresh[1] = (4 * R_slow[1]) / 3;
+	ref_wr_duration[0] = msecs_to_jiffies(7000); /* actually 8 sec */
+	ref_wr_duration[1] = msecs_to_jiffies(2500); /* actually 3 sec */
 
 	ret = elv_register(&iosched_bfq_mq);
 	if (ret)

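To make the reworked weight-raising duration formula above concrete (illustrative numbers, not from the patch): with ref_rate[1] = 33000 and ref_wr_duration[1] of about 2.5 seconds for non-rotational devices, a drive whose estimated peak rate comes out at half the reference rate gets dur = (33000 / 16500) * 2.5 s = 5 s of interactive weight-raising, while a device measured at a twentieth of the reference rate would compute 50 s and be clamped to the new 25 s upper limit by clamp_val().
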
+ 9 - 21
block/bfq-iosched.h

@@ -399,11 +399,6 @@ struct bfq_io_cq {
 	struct bfq_ttime saved_ttime;
 };
 
-enum bfq_device_speed {
-	BFQ_BFQD_FAST,
-	BFQ_BFQD_SLOW,
-};
-
 /**
  * struct bfq_data - per-device data structure.
  *
@@ -611,12 +606,11 @@ struct bfq_data {
 	/* Max service-rate for a soft real-time queue, in sectors/sec */
 	unsigned int bfq_wr_max_softrt_rate;
 	/*
-	 * Cached value of the product R*T, used for computing the
-	 * maximum duration of weight raising automatically.
+	 * Cached value of the product ref_rate*ref_wr_duration, used
+	 * for computing the maximum duration of weight raising
+	 * automatically.
 	 */
-	u64 RT_prod;
-	/* device-speed class for the low-latency heuristic */
-	enum bfq_device_speed device_speed;
+	u64 rate_dur_prod;
 
 	/* fallback dummy bfqq for extreme OOM conditions */
 	struct bfq_queue oom_bfqq;
@@ -635,12 +629,6 @@ struct bfq_data {
 	/* bfqq associated with the task issuing current bio for merging */
 	struct bfq_queue *bio_bfqq;
 
-	/*
-	 * Cached sbitmap shift, used to compute depth limits in
-	 * bfq_update_depths.
-	 */
-	unsigned int sb_shift;
-
 	/*
 	 * Depth limits used in bfq_limit_depth (see comments on the
 	 * function)
@@ -732,9 +720,9 @@ struct bfqg_stats {
 	/* total time with empty current active q with other requests queued */
 	struct blkg_stat		empty_time;
 	/* fields after this shouldn't be cleared on stat reset */
-	uint64_t			start_group_wait_time;
-	uint64_t			start_idle_time;
-	uint64_t			start_empty_time;
+	u64				start_group_wait_time;
+	u64				start_idle_time;
+	u64				start_empty_time;
 	uint16_t			flags;
 #endif	/* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
 };
@@ -856,8 +844,8 @@ void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
 			      unsigned int op);
 void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op);
 void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op);
-void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time,
-				  uint64_t io_start_time, unsigned int op);
+void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
+				  u64 io_start_time_ns, unsigned int op);
 void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
 void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
 void bfqg_stats_update_idle_time(struct bfq_group *bfqg);

+ 14 - 15
block/bio-integrity.c

@@ -56,12 +56,12 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
 	struct bio_set *bs = bio->bi_pool;
 	unsigned inline_vecs;
 
-	if (!bs || !bs->bio_integrity_pool) {
+	if (!bs || !mempool_initialized(&bs->bio_integrity_pool)) {
 		bip = kmalloc(sizeof(struct bio_integrity_payload) +
 			      sizeof(struct bio_vec) * nr_vecs, gfp_mask);
 		inline_vecs = nr_vecs;
 	} else {
-		bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
+		bip = mempool_alloc(&bs->bio_integrity_pool, gfp_mask);
 		inline_vecs = BIP_INLINE_VECS;
 	}
 
@@ -74,7 +74,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
 		unsigned long idx = 0;
 
 		bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx,
-					  bs->bvec_integrity_pool);
+					  &bs->bvec_integrity_pool);
 		if (!bip->bip_vec)
 			goto err;
 		bip->bip_max_vcnt = bvec_nr_vecs(idx);
@@ -90,7 +90,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
 
 	return bip;
 err:
-	mempool_free(bip, bs->bio_integrity_pool);
+	mempool_free(bip, &bs->bio_integrity_pool);
 	return ERR_PTR(-ENOMEM);
 }
 EXPORT_SYMBOL(bio_integrity_alloc);
@@ -111,10 +111,10 @@ static void bio_integrity_free(struct bio *bio)
 		kfree(page_address(bip->bip_vec->bv_page) +
 		      bip->bip_vec->bv_offset);
 
-	if (bs && bs->bio_integrity_pool) {
-		bvec_free(bs->bvec_integrity_pool, bip->bip_vec, bip->bip_slab);
+	if (bs && mempool_initialized(&bs->bio_integrity_pool)) {
+		bvec_free(&bs->bvec_integrity_pool, bip->bip_vec, bip->bip_slab);
 
-		mempool_free(bip, bs->bio_integrity_pool);
+		mempool_free(bip, &bs->bio_integrity_pool);
 	} else {
 		kfree(bip);
 	}
@@ -465,16 +465,15 @@ EXPORT_SYMBOL(bio_integrity_clone);
 
 int bioset_integrity_create(struct bio_set *bs, int pool_size)
 {
-	if (bs->bio_integrity_pool)
+	if (mempool_initialized(&bs->bio_integrity_pool))
 		return 0;
 
-	bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab);
-	if (!bs->bio_integrity_pool)
+	if (mempool_init_slab_pool(&bs->bio_integrity_pool,
+				   pool_size, bip_slab))
 		return -1;
 
-	bs->bvec_integrity_pool = biovec_create_pool(pool_size);
-	if (!bs->bvec_integrity_pool) {
-		mempool_destroy(bs->bio_integrity_pool);
+	if (biovec_init_pool(&bs->bvec_integrity_pool, pool_size)) {
+		mempool_exit(&bs->bio_integrity_pool);
 		return -1;
 	}
 
@@ -484,8 +483,8 @@ EXPORT_SYMBOL(bioset_integrity_create);
 
 void bioset_integrity_free(struct bio_set *bs)
 {
-	mempool_destroy(bs->bio_integrity_pool);
-	mempool_destroy(bs->bvec_integrity_pool);
+	mempool_exit(&bs->bio_integrity_pool);
+	mempool_exit(&bs->bvec_integrity_pool);
 }
 EXPORT_SYMBOL(bioset_integrity_free);
 

+ 109 - 80
block/bio.c

@@ -53,7 +53,7 @@ static struct biovec_slab bvec_slabs[BVEC_POOL_NR] __read_mostly = {
  * fs_bio_set is the bio_set containing bio and iovec memory pools used by
  * IO code that does not need private memory pools.
  */
-struct bio_set *fs_bio_set;
+struct bio_set fs_bio_set;
 EXPORT_SYMBOL(fs_bio_set);
 
 /*
@@ -254,7 +254,7 @@ static void bio_free(struct bio *bio)
 	bio_uninit(bio);
 
 	if (bs) {
-		bvec_free(bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio));
+		bvec_free(&bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio));
 
 		/*
 		 * If we have front padding, adjust the bio pointer before freeing
@@ -262,7 +262,7 @@ static void bio_free(struct bio *bio)
 		p = bio;
 		p -= bs->front_pad;
 
-		mempool_free(p, bs->bio_pool);
+		mempool_free(p, &bs->bio_pool);
 	} else {
 		/* Bio was allocated by bio_kmalloc() */
 		kfree(bio);
@@ -454,7 +454,8 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
 		inline_vecs = nr_iovecs;
 	} else {
 		/* should not use nobvec bioset for nr_iovecs > 0 */
-		if (WARN_ON_ONCE(!bs->bvec_pool && nr_iovecs > 0))
+		if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) &&
+				 nr_iovecs > 0))
 			return NULL;
 		/*
 		 * generic_make_request() converts recursion to iteration; this
@@ -483,11 +484,11 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
 		    bs->rescue_workqueue)
 			gfp_mask &= ~__GFP_DIRECT_RECLAIM;
 
-		p = mempool_alloc(bs->bio_pool, gfp_mask);
+		p = mempool_alloc(&bs->bio_pool, gfp_mask);
 		if (!p && gfp_mask != saved_gfp) {
 			punt_bios_to_rescuer(bs);
 			gfp_mask = saved_gfp;
-			p = mempool_alloc(bs->bio_pool, gfp_mask);
+			p = mempool_alloc(&bs->bio_pool, gfp_mask);
 		}
 
 		front_pad = bs->front_pad;
@@ -503,11 +504,11 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
 	if (nr_iovecs > inline_vecs) {
 		unsigned long idx = 0;
 
-		bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
+		bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool);
 		if (!bvl && gfp_mask != saved_gfp) {
 			punt_bios_to_rescuer(bs);
 			gfp_mask = saved_gfp;
-			bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
+			bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool);
 		}
 
 		if (unlikely(!bvl))
@@ -524,25 +525,25 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
 	return bio;
 
 err_free:
-	mempool_free(p, bs->bio_pool);
+	mempool_free(p, &bs->bio_pool);
 	return NULL;
 }
 EXPORT_SYMBOL(bio_alloc_bioset);
 
-void zero_fill_bio(struct bio *bio)
+void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start)
 {
 	unsigned long flags;
 	struct bio_vec bv;
 	struct bvec_iter iter;
 
-	bio_for_each_segment(bv, bio, iter) {
+	__bio_for_each_segment(bv, bio, iter, start) {
 		char *data = bvec_kmap_irq(&bv, &flags);
 		memset(data, 0, bv.bv_len);
 		flush_dcache_page(bv.bv_page);
 		bvec_kunmap_irq(data, &flags);
 	}
 }
-EXPORT_SYMBOL(zero_fill_bio);
+EXPORT_SYMBOL(zero_fill_bio_iter);
 
 /**
  * bio_put - release a reference to a bio
@@ -970,27 +971,68 @@ void bio_advance(struct bio *bio, unsigned bytes)
 }
 EXPORT_SYMBOL(bio_advance);
 
+void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
+			struct bio *src, struct bvec_iter *src_iter)
+{
+	struct bio_vec src_bv, dst_bv;
+	void *src_p, *dst_p;
+	unsigned bytes;
+
+	while (src_iter->bi_size && dst_iter->bi_size) {
+		src_bv = bio_iter_iovec(src, *src_iter);
+		dst_bv = bio_iter_iovec(dst, *dst_iter);
+
+		bytes = min(src_bv.bv_len, dst_bv.bv_len);
+
+		src_p = kmap_atomic(src_bv.bv_page);
+		dst_p = kmap_atomic(dst_bv.bv_page);
+
+		memcpy(dst_p + dst_bv.bv_offset,
+		       src_p + src_bv.bv_offset,
+		       bytes);
+
+		kunmap_atomic(dst_p);
+		kunmap_atomic(src_p);
+
+		flush_dcache_page(dst_bv.bv_page);
+
+		bio_advance_iter(src, src_iter, bytes);
+		bio_advance_iter(dst, dst_iter, bytes);
+	}
+}
+EXPORT_SYMBOL(bio_copy_data_iter);
+
 /**
- * bio_copy_data - copy contents of data buffers from one chain of bios to
- * another
- * @src: source bio list
- * @dst: destination bio list
- *
- * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats
- * @src and @dst as linked lists of bios.
+ * bio_copy_data - copy contents of data buffers from one bio to another
+ * @src: source bio
+ * @dst: destination bio
  *
  * Stops when it reaches the end of either @src or @dst - that is, copies
  * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
  */
 void bio_copy_data(struct bio *dst, struct bio *src)
 {
-	struct bvec_iter src_iter, dst_iter;
-	struct bio_vec src_bv, dst_bv;
-	void *src_p, *dst_p;
-	unsigned bytes;
+	struct bvec_iter src_iter = src->bi_iter;
+	struct bvec_iter dst_iter = dst->bi_iter;
 
-	src_iter = src->bi_iter;
-	dst_iter = dst->bi_iter;
+	bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
+}
+EXPORT_SYMBOL(bio_copy_data);
+
+/**
+ * bio_list_copy_data - copy contents of data buffers from one chain of bios to
+ * another
+ * @src: source bio list
+ * @dst: destination bio list
+ *
+ * Stops when it reaches the end of either the @src list or @dst list - that is,
+ * copies min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of
+ * bios).
+ */
+void bio_list_copy_data(struct bio *dst, struct bio *src)
+{
+	struct bvec_iter src_iter = src->bi_iter;
+	struct bvec_iter dst_iter = dst->bi_iter;
 
 	while (1) {
 		if (!src_iter.bi_size) {
@@ -1009,26 +1051,10 @@ void bio_copy_data(struct bio *dst, struct bio *src)
 			dst_iter = dst->bi_iter;
 		}
 
-		src_bv = bio_iter_iovec(src, src_iter);
-		dst_bv = bio_iter_iovec(dst, dst_iter);
-
-		bytes = min(src_bv.bv_len, dst_bv.bv_len);
-
-		src_p = kmap_atomic(src_bv.bv_page);
-		dst_p = kmap_atomic(dst_bv.bv_page);
-
-		memcpy(dst_p + dst_bv.bv_offset,
-		       src_p + src_bv.bv_offset,
-		       bytes);
-
-		kunmap_atomic(dst_p);
-		kunmap_atomic(src_p);
-
-		bio_advance_iter(src, &src_iter, bytes);
-		bio_advance_iter(dst, &dst_iter, bytes);
+		bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
 	}
 }
-EXPORT_SYMBOL(bio_copy_data);
+EXPORT_SYMBOL(bio_list_copy_data);
 
 struct bio_map_data {
 	int is_our_pages;
@@ -1584,6 +1610,7 @@ void bio_set_pages_dirty(struct bio *bio)
 			set_page_dirty_lock(page);
 	}
 }
+EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
 
 static void bio_release_pages(struct bio *bio)
 {
@@ -1667,6 +1694,7 @@ void bio_check_pages_dirty(struct bio *bio)
 		bio_put(bio);
 	}
 }
+EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
 
 void generic_start_io_acct(struct request_queue *q, int rw,
 			   unsigned long sectors, struct hd_struct *part)
@@ -1749,6 +1777,9 @@ again:
 	if (!bio_integrity_endio(bio))
 		return;
 
+	if (WARN_ONCE(bio->bi_next, "driver left bi_next not NULL"))
+		bio->bi_next = NULL;
+
 	/*
 	 * Need to have a real endio function for chained bios, otherwise
 	 * various corner cases will break (like stacking block devices that
@@ -1848,30 +1879,38 @@ EXPORT_SYMBOL_GPL(bio_trim);
  * create memory pools for biovec's in a bio_set.
  * use the global biovec slabs created for general use.
  */
-mempool_t *biovec_create_pool(int pool_entries)
+int biovec_init_pool(mempool_t *pool, int pool_entries)
 {
 	struct biovec_slab *bp = bvec_slabs + BVEC_POOL_MAX;
 
-	return mempool_create_slab_pool(pool_entries, bp->slab);
+	return mempool_init_slab_pool(pool, pool_entries, bp->slab);
 }
 
-void bioset_free(struct bio_set *bs)
+/*
+ * bioset_exit - exit a bioset initialized with bioset_init()
+ *
+ * May be called on a zeroed but uninitialized bioset (i.e. allocated with
+ * kzalloc()).
+ */
+void bioset_exit(struct bio_set *bs)
 {
 	if (bs->rescue_workqueue)
 		destroy_workqueue(bs->rescue_workqueue);
+	bs->rescue_workqueue = NULL;
 
-	mempool_destroy(bs->bio_pool);
-	mempool_destroy(bs->bvec_pool);
+	mempool_exit(&bs->bio_pool);
+	mempool_exit(&bs->bvec_pool);
 
 	bioset_integrity_free(bs);
-	bio_put_slab(bs);
-
-	kfree(bs);
+	if (bs->bio_slab)
+		bio_put_slab(bs);
+	bs->bio_slab = NULL;
 }
-EXPORT_SYMBOL(bioset_free);
+EXPORT_SYMBOL(bioset_exit);
 
 /**
- * bioset_create  - Create a bio_set
+ * bioset_init - Initialize a bio_set
+ * @bs:		pool to initialize
  * @pool_size:	Number of bio and bio_vecs to cache in the mempool
  * @front_pad:	Number of bytes to allocate in front of the returned bio
  * @flags:	Flags to modify behavior, currently %BIOSET_NEED_BVECS
@@ -1890,16 +1929,12 @@ EXPORT_SYMBOL(bioset_free);
  *    dispatch queued requests when the mempool runs out of space.
  *
  */
-struct bio_set *bioset_create(unsigned int pool_size,
-			      unsigned int front_pad,
-			      int flags)
+int bioset_init(struct bio_set *bs,
+		unsigned int pool_size,
+		unsigned int front_pad,
+		int flags)
 {
 	unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
-	struct bio_set *bs;
-
-	bs = kzalloc(sizeof(*bs), GFP_KERNEL);
-	if (!bs)
-		return NULL;
 
 	bs->front_pad = front_pad;
 
@@ -1908,34 +1943,29 @@ struct bio_set *bioset_create(unsigned int pool_size,
 	INIT_WORK(&bs->rescue_work, bio_alloc_rescue);
 
 	bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
-	if (!bs->bio_slab) {
-		kfree(bs);
-		return NULL;
-	}
+	if (!bs->bio_slab)
+		return -ENOMEM;
 
-	bs->bio_pool = mempool_create_slab_pool(pool_size, bs->bio_slab);
-	if (!bs->bio_pool)
+	if (mempool_init_slab_pool(&bs->bio_pool, pool_size, bs->bio_slab))
 		goto bad;
 
-	if (flags & BIOSET_NEED_BVECS) {
-		bs->bvec_pool = biovec_create_pool(pool_size);
-		if (!bs->bvec_pool)
-			goto bad;
-	}
+	if ((flags & BIOSET_NEED_BVECS) &&
+	    biovec_init_pool(&bs->bvec_pool, pool_size))
+		goto bad;
 
 	if (!(flags & BIOSET_NEED_RESCUER))
-		return bs;
+		return 0;
 
 	bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
 	if (!bs->rescue_workqueue)
 		goto bad;
 
-	return bs;
+	return 0;
 bad:
-	bioset_free(bs);
-	return NULL;
+	bioset_exit(bs);
+	return -ENOMEM;
 }
-EXPORT_SYMBOL(bioset_create);
+EXPORT_SYMBOL(bioset_init);
 
 #ifdef CONFIG_BLK_CGROUP
 
@@ -2020,11 +2050,10 @@ static int __init init_bio(void)
 	bio_integrity_init();
 	biovec_init_slabs();
 
-	fs_bio_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
-	if (!fs_bio_set)
+	if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS))
 		panic("bio: can't allocate bios\n");
 
-	if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE))
+	if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE))
 		panic("bio: can't create integrity pool\n");
 
 	return 0;
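
The hunks above convert bio_set management from pointer-returning constructors (bioset_create/bioset_free) to caller-embedded structures (bioset_init/bioset_exit), with fs_bio_set itself becoming a plain struct. A minimal sketch of a consumer of the new interface, assuming a hypothetical driver context; only bioset_init/bioset_exit and the flag are from this patch:

#include <linux/bio.h>

/* Hypothetical driver context embedding its bio_set instead of holding a pointer. */
struct my_dev {
	struct bio_set bs;
};

static int my_dev_init(struct my_dev *dev)
{
	/* bioset_init() now returns 0 or -ENOMEM rather than a pointer. */
	return bioset_init(&dev->bs, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
}

static void my_dev_exit(struct my_dev *dev)
{
	/* Per the comment above, safe even on a zeroed, never-initialized bio_set. */
	bioset_exit(&dev->bs);
}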

+ 52 - 68
block/blk-core.c

@@ -196,15 +196,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 	RB_CLEAR_NODE(&rq->rb_node);
 	rq->tag = -1;
 	rq->internal_tag = -1;
-	rq->start_time = jiffies;
-	set_start_time_ns(rq);
+	rq->start_time_ns = ktime_get_ns();
 	rq->part = NULL;
-	seqcount_init(&rq->gstate_seq);
-	u64_stats_init(&rq->aborted_gstate_sync);
-	/*
-	 * See comment of blk_mq_init_request
-	 */
-	WRITE_ONCE(rq->gstate, MQ_RQ_GEN_INC);
 }
 EXPORT_SYMBOL(blk_rq_init);
 
@@ -280,6 +273,10 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
 	bio_advance(bio, nbytes);
 
 	/* don't actually finish bio if it's part of flush sequence */
+	/*
+	 * XXX this code looks suspicious - it's not consistent with advancing
+	 * req->bio in caller
+	 */
 	if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
 		bio_endio(bio);
 }
@@ -360,7 +357,6 @@ EXPORT_SYMBOL(blk_start_queue_async);
 void blk_start_queue(struct request_queue *q)
 {
 	lockdep_assert_held(q->queue_lock);
-	WARN_ON(!in_interrupt() && !irqs_disabled());
 	WARN_ON_ONCE(q->mq_ops);
 
 	queue_flag_clear(QUEUE_FLAG_STOPPED, q);
@@ -996,18 +992,24 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id,
 					   spinlock_t *lock)
 {
 	struct request_queue *q;
+	int ret;
 
 	q = kmem_cache_alloc_node(blk_requestq_cachep,
 				gfp_mask | __GFP_ZERO, node_id);
 	if (!q)
 		return NULL;
 
+	INIT_LIST_HEAD(&q->queue_head);
+	q->last_merge = NULL;
+	q->end_sector = 0;
+	q->boundary_rq = NULL;
+
 	q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
 	if (q->id < 0)
 		goto fail_q;
 
-	q->bio_split = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
-	if (!q->bio_split)
+	ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
+	if (ret)
 		goto fail_id;
 
 	q->backing_dev_info = bdi_alloc_node(gfp_mask, node_id);
@@ -1079,7 +1081,7 @@ fail_bdi:
 fail_stats:
 	bdi_put(q->backing_dev_info);
 fail_split:
-	bioset_free(q->bio_split);
+	bioset_exit(&q->bio_split);
 fail_id:
 	ida_simple_remove(&blk_queue_ida, q->id);
 fail_q:
@@ -1173,16 +1175,8 @@ int blk_init_allocated_queue(struct request_queue *q)
 
 	q->sg_reserved_size = INT_MAX;
 
-	/* Protect q->elevator from elevator_change */
-	mutex_lock(&q->sysfs_lock);
-
-	/* init elevator */
-	if (elevator_init(q, NULL)) {
-		mutex_unlock(&q->sysfs_lock);
+	if (elevator_init(q))
 		goto out_exit_flush_rq;
-	}
-
-	mutex_unlock(&q->sysfs_lock);
 	return 0;
 
 out_exit_flush_rq:
@@ -1334,6 +1328,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
  * @op: operation and flags
  * @bio: bio to allocate request for (can be %NULL)
  * @flags: BLQ_MQ_REQ_* flags
+ * @gfp_mask: allocator flags
  *
  * Get a free request from @q.  This function may fail under memory
  * pressure or if @q is dead.
@@ -1343,7 +1338,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
  * Returns request pointer on success, with @q->queue_lock *not held*.
  */
 static struct request *__get_request(struct request_list *rl, unsigned int op,
-				     struct bio *bio, blk_mq_req_flags_t flags)
+		struct bio *bio, blk_mq_req_flags_t flags, gfp_t gfp_mask)
 {
 	struct request_queue *q = rl->q;
 	struct request *rq;
@@ -1352,8 +1347,6 @@ static struct request *__get_request(struct request_list *rl, unsigned int op,
 	struct io_cq *icq = NULL;
 	const bool is_sync = op_is_sync(op);
 	int may_queue;
-	gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC :
-			 __GFP_DIRECT_RECLAIM;
 	req_flags_t rq_flags = RQF_ALLOCED;
 
 	lockdep_assert_held(q->queue_lock);
@@ -1517,8 +1510,9 @@ rq_starved:
  * @op: operation and flags
  * @bio: bio to allocate request for (can be %NULL)
  * @flags: BLK_MQ_REQ_* flags.
+ * @gfp: allocator flags
  *
- * Get a free request from @q.  If %__GFP_DIRECT_RECLAIM is set in @gfp_mask,
+ * Get a free request from @q.  If %BLK_MQ_REQ_NOWAIT is set in @flags,
  * this function keeps retrying under memory pressure and fails iff @q is dead.
  *
  * Must be called with @q->queue_lock held and,
@@ -1526,7 +1520,7 @@ rq_starved:
  * Returns request pointer on success, with @q->queue_lock *not held*.
  */
 static struct request *get_request(struct request_queue *q, unsigned int op,
-				   struct bio *bio, blk_mq_req_flags_t flags)
+		struct bio *bio, blk_mq_req_flags_t flags, gfp_t gfp)
 {
 	const bool is_sync = op_is_sync(op);
 	DEFINE_WAIT(wait);
@@ -1538,7 +1532,7 @@ static struct request *get_request(struct request_queue *q, unsigned int op,
 
 	rl = blk_get_rl(q, bio);	/* transferred to @rq on success */
 retry:
-	rq = __get_request(rl, op, bio, flags);
+	rq = __get_request(rl, op, bio, flags, gfp);
 	if (!IS_ERR(rq))
 		return rq;
 
@@ -1579,8 +1573,7 @@ static struct request *blk_old_get_request(struct request_queue *q,
 				unsigned int op, blk_mq_req_flags_t flags)
 {
 	struct request *rq;
-	gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC :
-			 __GFP_DIRECT_RECLAIM;
+	gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC : GFP_NOIO;
 	int ret = 0;
 
 	WARN_ON_ONCE(q->mq_ops);
@@ -1592,7 +1585,7 @@ static struct request *blk_old_get_request(struct request_queue *q,
 	if (ret)
 		return ERR_PTR(ret);
 	spin_lock_irq(q->queue_lock);
-	rq = get_request(q, op, NULL, flags);
+	rq = get_request(q, op, NULL, flags, gfp_mask);
 	if (IS_ERR(rq)) {
 		spin_unlock_irq(q->queue_lock);
 		blk_queue_exit(q);
@@ -1607,13 +1600,13 @@ static struct request *blk_old_get_request(struct request_queue *q,
 }
 
 /**
- * blk_get_request_flags - allocate a request
+ * blk_get_request - allocate a request
  * @q: request queue to allocate a request for
  * @op: operation (REQ_OP_*) and REQ_* flags, e.g. REQ_SYNC.
  * @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT.
  */
-struct request *blk_get_request_flags(struct request_queue *q, unsigned int op,
-				      blk_mq_req_flags_t flags)
+struct request *blk_get_request(struct request_queue *q, unsigned int op,
+				blk_mq_req_flags_t flags)
 {
 	struct request *req;
 
@@ -1632,14 +1625,6 @@ struct request *blk_get_request_flags(struct request_queue *q, unsigned int op,
 
 	return req;
 }
-EXPORT_SYMBOL(blk_get_request_flags);
-
-struct request *blk_get_request(struct request_queue *q, unsigned int op,
-				gfp_t gfp_mask)
-{
-	return blk_get_request_flags(q, op, gfp_mask & __GFP_DIRECT_RECLAIM ?
-				     0 : BLK_MQ_REQ_NOWAIT);
-}
 EXPORT_SYMBOL(blk_get_request);
 
 /**
@@ -1660,7 +1645,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
 	blk_delete_timer(rq);
 	blk_clear_rq_complete(rq);
 	trace_block_rq_requeue(q, rq);
-	wbt_requeue(q->rq_wb, &rq->issue_stat);
+	wbt_requeue(q->rq_wb, rq);
 
 	if (rq->rq_flags & RQF_QUEUED)
 		blk_queue_end_tag(q, rq);
@@ -1767,7 +1752,7 @@ void __blk_put_request(struct request_queue *q, struct request *req)
 	/* this is a bio leak */
 	WARN_ON(req->bio != NULL);
 
-	wbt_done(q->rq_wb, &req->issue_stat);
+	wbt_done(q->rq_wb, req);
 
 	/*
 	 * Request may not have originated from ll_rw_blk. if not,
@@ -2066,7 +2051,7 @@ get_rq:
 	 * Returns with the queue unlocked.
 	 */
 	blk_queue_enter_live(q);
-	req = get_request(q, bio->bi_opf, bio, 0);
+	req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO);
 	if (IS_ERR(req)) {
 		blk_queue_exit(q);
 		__wbt_done(q->rq_wb, wb_acct);
@@ -2078,7 +2063,7 @@ get_rq:
 		goto out_unlock;
 	}
 
-	wbt_track(&req->issue_stat, wb_acct);
+	wbt_track(req, wb_acct);
 
 	/*
 	 * After dropping the lock and possibly sleeping here, our request
@@ -2392,7 +2377,9 @@ blk_qc_t generic_make_request(struct bio *bio)
 
 	if (bio->bi_opf & REQ_NOWAIT)
 		flags = BLK_MQ_REQ_NOWAIT;
-	if (blk_queue_enter(q, flags) < 0) {
+	if (bio_flagged(bio, BIO_QUEUE_ENTERED))
+		blk_queue_enter_live(q);
+	else if (blk_queue_enter(q, flags) < 0) {
 		if (!blk_queue_dying(q) && (bio->bi_opf & REQ_NOWAIT))
 			bio_wouldblock_error(bio);
 		else
@@ -2727,7 +2714,7 @@ void blk_account_io_completion(struct request *req, unsigned int bytes)
 	}
 }
 
-void blk_account_io_done(struct request *req)
+void blk_account_io_done(struct request *req, u64 now)
 {
 	/*
 	 * Account IO completion.  flush_rq isn't accounted as a
@@ -2735,11 +2722,12 @@ void blk_account_io_done(struct request *req)
 	 * containing request is enough.
 	 */
 	if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) {
-		unsigned long duration = jiffies - req->start_time;
+		unsigned long duration;
 		const int rw = rq_data_dir(req);
 		struct hd_struct *part;
 		int cpu;
 
+		duration = nsecs_to_jiffies(now - req->start_time_ns);
 		cpu = part_stat_lock();
 		part = req->part;
 
@@ -2970,10 +2958,8 @@ static void blk_dequeue_request(struct request *rq)
 	 * and it being freed is accounted as io that is in progress at
 	 * the driver side.
 	 */
-	if (blk_account_rq(rq)) {
+	if (blk_account_rq(rq))
 		q->in_flight[rq_is_sync(rq)]++;
-		set_io_start_time_ns(rq);
-	}
 }
 
 /**
@@ -2992,9 +2978,12 @@ void blk_start_request(struct request *req)
 	blk_dequeue_request(req);
 
 	if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) {
-		blk_stat_set_issue(&req->issue_stat, blk_rq_sectors(req));
+		req->io_start_time_ns = ktime_get_ns();
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+		req->throtl_size = blk_rq_sectors(req);
+#endif
 		req->rq_flags |= RQF_STATS;
-		wbt_issue(req->q->rq_wb, &req->issue_stat);
+		wbt_issue(req->q->rq_wb, req);
 	}
 
 	BUG_ON(blk_rq_is_complete(req));
@@ -3092,8 +3081,10 @@ bool blk_update_request(struct request *req, blk_status_t error,
 		struct bio *bio = req->bio;
 		unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
 
-		if (bio_bytes == bio->bi_iter.bi_size)
+		if (bio_bytes == bio->bi_iter.bi_size) {
 			req->bio = bio->bi_next;
+			bio->bi_next = NULL;
+		}
 
 		/* Completion has already been traced */
 		bio_clear_flag(bio, BIO_TRACE_COMPLETION);
@@ -3190,12 +3181,13 @@ EXPORT_SYMBOL_GPL(blk_unprep_request);
 void blk_finish_request(struct request *req, blk_status_t error)
 {
 	struct request_queue *q = req->q;
+	u64 now = ktime_get_ns();
 
 	lockdep_assert_held(req->q->queue_lock);
 	WARN_ON_ONCE(q->mq_ops);
 
 	if (req->rq_flags & RQF_STATS)
-		blk_stat_add(req);
+		blk_stat_add(req, now);
 
 	if (req->rq_flags & RQF_QUEUED)
 		blk_queue_end_tag(q, req);
@@ -3210,10 +3202,10 @@ void blk_finish_request(struct request *req, blk_status_t error)
 	if (req->rq_flags & RQF_DONTPREP)
 		blk_unprep_request(req);
 
-	blk_account_io_done(req);
+	blk_account_io_done(req, now);
 
 	if (req->end_io) {
-		wbt_done(req->q->rq_wb, &req->issue_stat);
+		wbt_done(req->q->rq_wb, req);
 		req->end_io(req, error);
 	} else {
 		if (blk_bidi_rq(req))
@@ -3519,7 +3511,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
 	struct bio *bio, *bio_src;
 
 	if (!bs)
-		bs = fs_bio_set;
+		bs = &fs_bio_set;
 
 	__rq_for_each_bio(bio_src, rq_src) {
 		bio = bio_clone_fast(bio_src, gfp_mask, bs);
@@ -3630,7 +3622,7 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth,
 		blk_run_queue_async(q);
 	else
 		__blk_run_queue(q);
-	spin_unlock(q->queue_lock);
+	spin_unlock_irq(q->queue_lock);
 }
 
 static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
@@ -3678,7 +3670,6 @@ EXPORT_SYMBOL(blk_check_plugged);
 void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 {
 	struct request_queue *q;
-	unsigned long flags;
 	struct request *rq;
 	LIST_HEAD(list);
 	unsigned int depth;
@@ -3698,11 +3689,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 	q = NULL;
 	depth = 0;
 
-	/*
-	 * Save and disable interrupts here, to avoid doing it for every
-	 * queue lock we have to take.
-	 */
-	local_irq_save(flags);
 	while (!list_empty(&list)) {
 		rq = list_entry_rq(list.next);
 		list_del_init(&rq->queuelist);
@@ -3715,7 +3701,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 				queue_unplugged(q, depth, from_schedule);
 			q = rq->q;
 			depth = 0;
-			spin_lock(q->queue_lock);
+			spin_lock_irq(q->queue_lock);
 		}
 
 		/*
@@ -3742,8 +3728,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 	 */
 	if (q)
 		queue_unplugged(q, depth, from_schedule);
-
-	local_irq_restore(flags);
 }
 
 void blk_finish_plug(struct blk_plug *plug)
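
Together with the __get_request()/get_request() changes above, blk_get_request() drops its gfp_t argument: callers now pass BLK_MQ_REQ_* flags and the block layer picks GFP_ATOMIC or GFP_NOIO internally. A hedged sketch of a caller after the conversion; the wrapper itself is hypothetical:

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

/* Illustrative wrapper: "don't sleep" is now expressed as a flag, not a gfp mask. */
static struct request *my_alloc_drv_rq(struct request_queue *q, bool nowait)
{
	blk_mq_req_flags_t flags = nowait ? BLK_MQ_REQ_NOWAIT : 0;

	return blk_get_request(q, REQ_OP_DRV_IN, flags);
}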

+ 6 - 6
block/blk-integrity.c

@@ -333,34 +333,34 @@ static ssize_t integrity_device_show(struct blk_integrity *bi, char *page)
 }
 
 static struct integrity_sysfs_entry integrity_format_entry = {
-	.attr = { .name = "format", .mode = S_IRUGO },
+	.attr = { .name = "format", .mode = 0444 },
 	.show = integrity_format_show,
 };
 
 static struct integrity_sysfs_entry integrity_tag_size_entry = {
-	.attr = { .name = "tag_size", .mode = S_IRUGO },
+	.attr = { .name = "tag_size", .mode = 0444 },
 	.show = integrity_tag_size_show,
 };
 
 static struct integrity_sysfs_entry integrity_interval_entry = {
-	.attr = { .name = "protection_interval_bytes", .mode = S_IRUGO },
+	.attr = { .name = "protection_interval_bytes", .mode = 0444 },
 	.show = integrity_interval_show,
 };
 
 static struct integrity_sysfs_entry integrity_verify_entry = {
-	.attr = { .name = "read_verify", .mode = S_IRUGO | S_IWUSR },
+	.attr = { .name = "read_verify", .mode = 0644 },
 	.show = integrity_verify_show,
 	.store = integrity_verify_store,
 };
 
 static struct integrity_sysfs_entry integrity_generate_entry = {
-	.attr = { .name = "write_generate", .mode = S_IRUGO | S_IWUSR },
+	.attr = { .name = "write_generate", .mode = 0644 },
 	.show = integrity_generate_show,
 	.store = integrity_generate_store,
 };
 
 static struct integrity_sysfs_entry integrity_device_entry = {
-	.attr = { .name = "device_is_integrity_capable", .mode = S_IRUGO },
+	.attr = { .name = "device_is_integrity_capable", .mode = 0444 },
 	.show = integrity_device_show,
 };
 

+ 9 - 3
block/blk-lib.c

@@ -62,10 +62,16 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		unsigned int req_sects;
 		sector_t end_sect, tmp;
 
-		/* Make sure bi_size doesn't overflow */
-		req_sects = min_t(sector_t, nr_sects, UINT_MAX >> 9);
+		/*
+		 * Issue in chunks of the user-defined max discard setting,
+		 * ensuring that bi_size doesn't overflow
+		 */
+		req_sects = min_t(sector_t, nr_sects,
+					q->limits.max_discard_sectors);
+		if (req_sects > UINT_MAX >> 9)
+			req_sects = UINT_MAX >> 9;
 
-		/**
+		/*
 		 * If splitting a request, and the next starting sector would be
 		 * misaligned, stop the discard at the previous aligned sector.
 		 */
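
The hunk above chunks each discard by the queue's max_discard_sectors while still guarding bi_size, a 32-bit byte count, against overflow. A standalone userspace illustration of that clamping arithmetic, assuming 512-byte sectors:

#include <stdint.h>

static uint64_t discard_chunk_sectors(uint64_t nr_sects,
				      uint64_t max_discard_sectors)
{
	uint64_t req_sects = nr_sects < max_discard_sectors ?
			     nr_sects : max_discard_sectors;

	/* bi_size holds bytes in 32 bits, so never exceed UINT32_MAX >> 9 sectors. */
	if (req_sects > (UINT32_MAX >> 9))
		req_sects = UINT32_MAX >> 9;
	return req_sects;
}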

+ 19 - 10
block/blk-merge.c

@@ -188,16 +188,16 @@ void blk_queue_split(struct request_queue *q, struct bio **bio)
 	switch (bio_op(*bio)) {
 	case REQ_OP_DISCARD:
 	case REQ_OP_SECURE_ERASE:
-		split = blk_bio_discard_split(q, *bio, q->bio_split, &nsegs);
+		split = blk_bio_discard_split(q, *bio, &q->bio_split, &nsegs);
 		break;
 	case REQ_OP_WRITE_ZEROES:
-		split = blk_bio_write_zeroes_split(q, *bio, q->bio_split, &nsegs);
+		split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split, &nsegs);
 		break;
 	case REQ_OP_WRITE_SAME:
-		split = blk_bio_write_same_split(q, *bio, q->bio_split, &nsegs);
+		split = blk_bio_write_same_split(q, *bio, &q->bio_split, &nsegs);
 		break;
 	default:
-		split = blk_bio_segment_split(q, *bio, q->bio_split, &nsegs);
+		split = blk_bio_segment_split(q, *bio, &q->bio_split, &nsegs);
 		break;
 	}
 
@@ -210,6 +210,16 @@ void blk_queue_split(struct request_queue *q, struct bio **bio)
 		/* there is no chance to merge the split bio */
 		split->bi_opf |= REQ_NOMERGE;
 
+		/*
+		 * Since we're recursing into make_request here, ensure
+		 * that we mark this bio as already having entered the queue.
+		 * If not, and the queue is going away, we can get stuck
+		 * forever on waiting for the queue reference to drop. But
+		 * forever waiting for the queue reference to drop. But
+		 * reference to it.
+		 */
+		bio_set_flag(*bio, BIO_QUEUE_ENTERED);
+
 		bio_chain(split, *bio);
 		trace_block_split(q, split, (*bio)->bi_iter.bi_sector);
 		generic_make_request(*bio);
@@ -724,13 +734,12 @@ static struct request *attempt_merge(struct request_queue *q,
 	}
 
 	/*
-	 * At this point we have either done a back merge
-	 * or front merge. We need the smaller start_time of
-	 * the merged requests to be the current request
-	 * for accounting purposes.
+	 * At this point we have either done a back merge or front merge. We
+	 * need the smaller start_time_ns of the merged requests to be the
+	 * current request for accounting purposes.
 	 */
-	if (time_after(req->start_time, next->start_time))
-		req->start_time = next->start_time;
+	if (next->start_time_ns < req->start_time_ns)
+		req->start_time_ns = next->start_time_ns;
 
 	req->biotail->bi_next = next->bio;
 	req->biotail = next->biotail;

+ 0 - 1
block/blk-mq-debugfs.c

@@ -344,7 +344,6 @@ static const char *const rqf_name[] = {
 	RQF_NAME(STATS),
 	RQF_NAME(SPECIAL_PAYLOAD),
 	RQF_NAME(ZONE_WRITE_LOCKED),
-	RQF_NAME(MQ_TIMEOUT_EXPIRED),
 	RQF_NAME(MQ_POLL_SLEPT),
 };
 #undef RQF_NAME

+ 25 - 21
block/blk-mq-sched.c

@@ -268,19 +268,16 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
 EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
 
 /*
- * Reverse check our software queue for entries that we could potentially
- * merge with. Currently includes a hand-wavy stop count of 8, to not spend
- * too much time checking for merges.
+ * Iterate list of requests and see if we can merge this bio with any
+ * of them.
  */
-static bool blk_mq_attempt_merge(struct request_queue *q,
-				 struct blk_mq_ctx *ctx, struct bio *bio)
+bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
+			   struct bio *bio)
 {
 	struct request *rq;
 	int checked = 8;
 
-	lockdep_assert_held(&ctx->lock);
-
-	list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
+	list_for_each_entry_reverse(rq, list, queuelist) {
 		bool merged = false;
 
 		if (!checked--)
@@ -305,13 +302,30 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
 			continue;
 		}
 
-		if (merged)
-			ctx->rq_merged++;
 		return merged;
 	}
 
 	return false;
 }
+EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge);
+
+/*
+ * Reverse check our software queue for entries that we could potentially
+ * merge with. Currently includes a hand-wavy stop count of 8, to not spend
+ * too much time checking for merges.
+ */
+static bool blk_mq_attempt_merge(struct request_queue *q,
+				 struct blk_mq_ctx *ctx, struct bio *bio)
+{
+	lockdep_assert_held(&ctx->lock);
+
+	if (blk_mq_bio_list_merge(q, &ctx->rq_list, bio)) {
+		ctx->rq_merged++;
+		return true;
+	}
+
+	return false;
+}
 
 bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
 {
@@ -571,6 +585,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 
 	if (!e) {
 		q->elevator = NULL;
+		q->nr_requests = q->tag_set->queue_depth;
 		return 0;
 	}
 
@@ -633,14 +648,3 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
 	blk_mq_sched_tags_teardown(q);
 	q->elevator = NULL;
 }
-
-int blk_mq_sched_init(struct request_queue *q)
-{
-	int ret;
-
-	mutex_lock(&q->sysfs_lock);
-	ret = elevator_init(q, NULL);
-	mutex_unlock(&q->sysfs_lock);
-
-	return ret;
-}
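
blk_mq_bio_list_merge() is split out and exported above so an I/O scheduler (the Kyber merge work mentioned in the changelog) can run the same reverse-scan merge over its own request lists. A rough sketch of such a caller; the list, lock, and function name are hypothetical scheduler state, and the real hook signature may differ:

#include <linux/blk-mq.h>
#include <linux/spinlock.h>

static bool my_sched_try_merge(struct request_queue *q, struct bio *bio,
			       struct list_head *my_rq_list,
			       spinlock_t *my_lock)
{
	bool merged;

	/* Same helper the software-queue path uses, applied to scheduler-owned state. */
	spin_lock(my_lock);
	merged = blk_mq_bio_list_merge(q, my_rq_list, bio);
	spin_unlock(my_lock);

	return merged;
}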

+ 0 - 2
block/blk-mq-sched.h

@@ -33,8 +33,6 @@ int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
 void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
 			    unsigned int hctx_idx);
 
-int blk_mq_sched_init(struct request_queue *q);
-
 static inline bool
 blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
 {

+ 3 - 3
block/blk-mq-sysfs.c

@@ -166,15 +166,15 @@ static struct attribute *default_ctx_attrs[] = {
 };
 
 static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_tags = {
-	.attr = {.name = "nr_tags", .mode = S_IRUGO },
+	.attr = {.name = "nr_tags", .mode = 0444 },
 	.show = blk_mq_hw_sysfs_nr_tags_show,
 };
 static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_reserved_tags = {
-	.attr = {.name = "nr_reserved_tags", .mode = S_IRUGO },
+	.attr = {.name = "nr_reserved_tags", .mode = 0444 },
 	.show = blk_mq_hw_sysfs_nr_reserved_tags_show,
 };
 static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = {
-	.attr = {.name = "cpu_list", .mode = S_IRUGO },
+	.attr = {.name = "cpu_list", .mode = 0444 },
 	.show = blk_mq_hw_sysfs_cpus_show,
 };
 

+ 13 - 1
block/blk-mq-tag.c

@@ -134,6 +134,8 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	ws = bt_wait_ptr(bt, data->hctx);
 	drop_ctx = data->ctx == NULL;
 	do {
+		struct sbitmap_queue *bt_prev;
+
 		/*
 		 * We're out of tags on this hardware queue, kick any
 		 * pending IO submits before going to sleep waiting for
@@ -159,6 +161,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		if (data->ctx)
 			blk_mq_put_ctx(data->ctx);
 
+		bt_prev = bt;
 		io_schedule();
 
 		data->ctx = blk_mq_get_ctx(data->q);
@@ -170,6 +173,15 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 			bt = &tags->bitmap_tags;
 
 		finish_wait(&ws->wait, &wait);
+
+		/*
+		 * If the destination hw queue changed, fake a wake-up on the
+		 * previous queue to compensate for the missed wake-up, so
+		 * other allocations on the previous queue won't be starved.
+		 */
+		if (bt != bt_prev)
+			sbitmap_queue_wake_up(bt_prev);
+
 		ws = bt_wait_ptr(bt, data->hctx);
 	} while (1);
 
@@ -259,7 +271,7 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 	 * test and set the bit before assigning ->rqs[].
 	 */
 	rq = tags->rqs[bitnr];
-	if (rq)
+	if (rq && blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
 		iter_data->fn(rq, iter_data->data, reserved);
 
 	return true;

+ 108 - 232
block/blk-mq.c

@@ -309,7 +309,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	RB_CLEAR_NODE(&rq->rb_node);
 	rq->rq_disk = NULL;
 	rq->part = NULL;
-	rq->start_time = jiffies;
+	rq->start_time_ns = ktime_get_ns();
+	rq->io_start_time_ns = 0;
 	rq->nr_phys_segments = 0;
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 	rq->nr_integrity_segments = 0;
@@ -328,11 +329,10 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 
 #ifdef CONFIG_BLK_CGROUP
 	rq->rl = NULL;
-	set_start_time_ns(rq);
-	rq->io_start_time_ns = 0;
 #endif
 
 	data->ctx->rq_dispatched[op_is_sync(op)]++;
+	refcount_set(&rq->ref, 1);
 	return rq;
 }
 
@@ -361,9 +361,11 @@ static struct request *blk_mq_get_request(struct request_queue *q,
 
 		/*
 		 * Flush requests are special and go directly to the
-		 * dispatch list.
+		 * dispatch list. Don't include reserved tags in the
+		 * limiting, as it isn't useful.
 		 */
-		if (!op_is_flush(op) && e->type->ops.mq.limit_depth)
+		if (!op_is_flush(op) && e->type->ops.mq.limit_depth &&
+		    !(data->flags & BLK_MQ_REQ_RESERVED))
 			e->type->ops.mq.limit_depth(op, data);
 	}
 
@@ -464,13 +466,27 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 }
 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
 
+static void __blk_mq_free_request(struct request *rq)
+{
+	struct request_queue *q = rq->q;
+	struct blk_mq_ctx *ctx = rq->mq_ctx;
+	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+	const int sched_tag = rq->internal_tag;
+
+	if (rq->tag != -1)
+		blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
+	if (sched_tag != -1)
+		blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
+	blk_mq_sched_restart(hctx);
+	blk_queue_exit(q);
+}
+
 void blk_mq_free_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;
 	struct elevator_queue *e = q->elevator;
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
 	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
-	const int sched_tag = rq->internal_tag;
 
 	if (rq->rq_flags & RQF_ELVPRIV) {
 		if (e && e->type->ops.mq.finish_request)
@@ -488,27 +504,30 @@ void blk_mq_free_request(struct request *rq)
 	if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
 		laptop_io_completion(q->backing_dev_info);
 
-	wbt_done(q->rq_wb, &rq->issue_stat);
+	wbt_done(q->rq_wb, rq);
 
 	if (blk_rq_rl(rq))
 		blk_put_rl(blk_rq_rl(rq));
 
-	blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
-	if (rq->tag != -1)
-		blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
-	if (sched_tag != -1)
-		blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
-	blk_mq_sched_restart(hctx);
-	blk_queue_exit(q);
+	WRITE_ONCE(rq->state, MQ_RQ_IDLE);
+	if (refcount_dec_and_test(&rq->ref))
+		__blk_mq_free_request(rq);
 }
 EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
 inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
 {
-	blk_account_io_done(rq);
+	u64 now = ktime_get_ns();
+
+	if (rq->rq_flags & RQF_STATS) {
+		blk_mq_poll_stats_start(rq->q);
+		blk_stat_add(rq, now);
+	}
+
+	blk_account_io_done(rq, now);
 
 	if (rq->end_io) {
-		wbt_done(rq->q->rq_wb, &rq->issue_stat);
+		wbt_done(rq->q->rq_wb, rq);
 		rq->end_io(rq, error);
 	} else {
 		if (unlikely(blk_bidi_rq(rq)))
@@ -539,15 +558,12 @@ static void __blk_mq_complete_request(struct request *rq)
 	bool shared = false;
 	int cpu;
 
-	WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT);
-	blk_mq_rq_update_state(rq, MQ_RQ_COMPLETE);
+	if (cmpxchg(&rq->state, MQ_RQ_IN_FLIGHT, MQ_RQ_COMPLETE) !=
+			MQ_RQ_IN_FLIGHT)
+		return;
 
 	if (rq->internal_tag != -1)
 		blk_mq_sched_completed_request(rq);
-	if (rq->rq_flags & RQF_STATS) {
-		blk_mq_poll_stats_start(rq->q);
-		blk_stat_add(rq);
-	}
 
 	if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
 		rq->q->softirq_done_fn(rq);
@@ -589,36 +605,6 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
 		*srcu_idx = srcu_read_lock(hctx->srcu);
 }
 
-static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate)
-{
-	unsigned long flags;
-
-	/*
-	 * blk_mq_rq_aborted_gstate() is used from the completion path and
-	 * can thus be called from irq context.  u64_stats_fetch in the
-	 * middle of update on the same CPU leads to lockup.  Disable irq
-	 * while updating.
-	 */
-	local_irq_save(flags);
-	u64_stats_update_begin(&rq->aborted_gstate_sync);
-	rq->aborted_gstate = gstate;
-	u64_stats_update_end(&rq->aborted_gstate_sync);
-	local_irq_restore(flags);
-}
-
-static u64 blk_mq_rq_aborted_gstate(struct request *rq)
-{
-	unsigned int start;
-	u64 aborted_gstate;
-
-	do {
-		start = u64_stats_fetch_begin(&rq->aborted_gstate_sync);
-		aborted_gstate = rq->aborted_gstate;
-	} while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start));
-
-	return aborted_gstate;
-}
-
 /**
  * blk_mq_complete_request - end I/O on a request
  * @rq:		the request being processed
@@ -629,28 +615,9 @@ static u64 blk_mq_rq_aborted_gstate(struct request *rq)
  **/
 void blk_mq_complete_request(struct request *rq)
 {
-	struct request_queue *q = rq->q;
-	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
-	int srcu_idx;
-
-	if (unlikely(blk_should_fake_timeout(q)))
+	if (unlikely(blk_should_fake_timeout(rq->q)))
 		return;
-
-	/*
-	 * If @rq->aborted_gstate equals the current instance, timeout is
-	 * claiming @rq and we lost.  This is synchronized through
-	 * hctx_lock().  See blk_mq_timeout_work() for details.
-	 *
-	 * Completion path never blocks and we can directly use RCU here
-	 * instead of hctx_lock() which can be either RCU or SRCU.
-	 * However, that would complicate paths which want to synchronize
-	 * against us.  Let stay in sync with the issue path so that
-	 * hctx_lock() covers both issue and completion paths.
-	 */
-	hctx_lock(hctx, &srcu_idx);
-	if (blk_mq_rq_aborted_gstate(rq) != rq->gstate)
-		__blk_mq_complete_request(rq);
-	hctx_unlock(hctx, srcu_idx);
+	__blk_mq_complete_request(rq);
 }
 EXPORT_SYMBOL(blk_mq_complete_request);
 
@@ -669,32 +636,18 @@ void blk_mq_start_request(struct request *rq)
 	trace_block_rq_issue(q, rq);
 
 	if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
-		blk_stat_set_issue(&rq->issue_stat, blk_rq_sectors(rq));
+		rq->io_start_time_ns = ktime_get_ns();
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+		rq->throtl_size = blk_rq_sectors(rq);
+#endif
 		rq->rq_flags |= RQF_STATS;
-		wbt_issue(q->rq_wb, &rq->issue_stat);
+		wbt_issue(q->rq_wb, rq);
 	}
 
 	WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
 
-	/*
-	 * Mark @rq in-flight which also advances the generation number,
-	 * and register for timeout.  Protect with a seqcount to allow the
-	 * timeout path to read both @rq->gstate and @rq->deadline
-	 * coherently.
-	 *
-	 * This is the only place where a request is marked in-flight.  If
-	 * the timeout path reads an in-flight @rq->gstate, the
-	 * @rq->deadline it reads together under @rq->gstate_seq is
-	 * guaranteed to be the matching one.
-	 */
-	preempt_disable();
-	write_seqcount_begin(&rq->gstate_seq);
-
-	blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT);
 	blk_add_timer(rq);
-
-	write_seqcount_end(&rq->gstate_seq);
-	preempt_enable();
+	WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
 
 	if (q->dma_drain_size && blk_rq_bytes(rq)) {
 		/*
@@ -707,11 +660,6 @@ void blk_mq_start_request(struct request *rq)
 }
 EXPORT_SYMBOL(blk_mq_start_request);
 
-/*
- * When we reach here because queue is busy, it's safe to change the state
- * to IDLE without checking @rq->aborted_gstate because we should still be
- * holding the RCU read lock and thus protected against timeout.
- */
 static void __blk_mq_requeue_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;
@@ -719,10 +667,10 @@ static void __blk_mq_requeue_request(struct request *rq)
 	blk_mq_put_driver_tag(rq);
 
 	trace_block_rq_requeue(q, rq);
-	wbt_requeue(q->rq_wb, &rq->issue_stat);
+	wbt_requeue(q->rq_wb, rq);
 
-	if (blk_mq_rq_state(rq) != MQ_RQ_IDLE) {
-		blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
+	if (blk_mq_request_started(rq)) {
+		WRITE_ONCE(rq->state, MQ_RQ_IDLE);
 		if (q->dma_drain_size && blk_rq_bytes(rq))
 			rq->nr_phys_segments--;
 	}
@@ -820,101 +768,79 @@ struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 }
 EXPORT_SYMBOL(blk_mq_tag_to_rq);
 
-struct blk_mq_timeout_data {
-	unsigned long next;
-	unsigned int next_set;
-	unsigned int nr_expired;
-};
-
 static void blk_mq_rq_timed_out(struct request *req, bool reserved)
 {
-	const struct blk_mq_ops *ops = req->q->mq_ops;
-	enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
-
-	req->rq_flags |= RQF_MQ_TIMEOUT_EXPIRED;
-
-	if (ops->timeout)
-		ret = ops->timeout(req, reserved);
+	if (req->q->mq_ops->timeout) {
+		enum blk_eh_timer_return ret;
 
-	switch (ret) {
-	case BLK_EH_HANDLED:
-		__blk_mq_complete_request(req);
-		break;
-	case BLK_EH_RESET_TIMER:
-		/*
-		 * As nothing prevents from completion happening while
-		 * ->aborted_gstate is set, this may lead to ignored
-		 * completions and further spurious timeouts.
-		 */
-		blk_mq_rq_update_aborted_gstate(req, 0);
-		blk_add_timer(req);
-		break;
-	case BLK_EH_NOT_HANDLED:
-		break;
-	default:
-		printk(KERN_ERR "block: bad eh return: %d\n", ret);
-		break;
+		ret = req->q->mq_ops->timeout(req, reserved);
+		if (ret == BLK_EH_DONE)
+			return;
+		WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
 	}
+
+	blk_add_timer(req);
 }
 
-static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
-		struct request *rq, void *priv, bool reserved)
+static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
 {
-	struct blk_mq_timeout_data *data = priv;
-	unsigned long gstate, deadline;
-	int start;
+	unsigned long deadline;
 
-	might_sleep();
-
-	if (rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED)
-		return;
+	if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT)
+		return false;
 
-	/* read coherent snapshots of @rq->state_gen and @rq->deadline */
-	while (true) {
-		start = read_seqcount_begin(&rq->gstate_seq);
-		gstate = READ_ONCE(rq->gstate);
-		deadline = blk_rq_deadline(rq);
-		if (!read_seqcount_retry(&rq->gstate_seq, start))
-			break;
-		cond_resched();
-	}
+	deadline = blk_rq_deadline(rq);
+	if (time_after_eq(jiffies, deadline))
+		return true;
 
-	/* if in-flight && overdue, mark for abortion */
-	if ((gstate & MQ_RQ_STATE_MASK) == MQ_RQ_IN_FLIGHT &&
-	    time_after_eq(jiffies, deadline)) {
-		blk_mq_rq_update_aborted_gstate(rq, gstate);
-		data->nr_expired++;
-		hctx->nr_expired++;
-	} else if (!data->next_set || time_after(data->next, deadline)) {
-		data->next = deadline;
-		data->next_set = 1;
-	}
+	if (*next == 0)
+		*next = deadline;
+	else if (time_after(*next, deadline))
+		*next = deadline;
+	return false;
 }
 
-static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx,
+static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
 		struct request *rq, void *priv, bool reserved)
 {
+	unsigned long *next = priv;
+
 	/*
-	 * We marked @rq->aborted_gstate and waited for RCU.  If there were
-	 * completions that we lost to, they would have finished and
-	 * updated @rq->gstate by now; otherwise, the completion path is
-	 * now guaranteed to see @rq->aborted_gstate and yield.  If
-	 * @rq->aborted_gstate still matches @rq->gstate, @rq is ours.
+	 * Just do a quick check if it is expired before locking the request in
+	 * so we're not unnecessarily synchronizing across CPUs.
+	 */
+	if (!blk_mq_req_expired(rq, next))
+		return;
+
+	/*
+	 * We have reason to believe the request may be expired. Take a
+	 * reference on the request to lock this request's lifetime into its
+	 * currently allocated context to prevent it from being reallocated in
+	 * the event the completion bypasses this timeout handler.
+	 *
+	 * If the reference was already released, then the driver beat the
+	 * timeout handler to posting a natural completion.
 	 */
-	if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) &&
-	    READ_ONCE(rq->gstate) == rq->aborted_gstate)
+	if (!refcount_inc_not_zero(&rq->ref))
+		return;
+
+	/*
+	 * The request is now locked and cannot be reallocated underneath the
+	 * timeout handler's processing. Re-verify this exact request is truly
+	 * expired; if it is not expired, then the request was completed and
+	 * reallocated as a new request.
+	 */
+	if (blk_mq_req_expired(rq, next))
 		blk_mq_rq_timed_out(rq, reserved);
+	if (refcount_dec_and_test(&rq->ref))
+		__blk_mq_free_request(rq);
 }
 
 static void blk_mq_timeout_work(struct work_struct *work)
 {
 	struct request_queue *q =
 		container_of(work, struct request_queue, timeout_work);
-	struct blk_mq_timeout_data data = {
-		.next		= 0,
-		.next_set	= 0,
-		.nr_expired	= 0,
-	};
+	unsigned long next = 0;
 	struct blk_mq_hw_ctx *hctx;
 	int i;
 
@@ -934,39 +860,10 @@ static void blk_mq_timeout_work(struct work_struct *work)
 	if (!percpu_ref_tryget(&q->q_usage_counter))
 		return;
 
-	/* scan for the expired ones and set their ->aborted_gstate */
-	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
-
-	if (data.nr_expired) {
-		bool has_rcu = false;
-
-		/*
-		 * Wait till everyone sees ->aborted_gstate.  The
-		 * sequential waits for SRCUs aren't ideal.  If this ever
-		 * becomes a problem, we can add per-hw_ctx rcu_head and
-		 * wait in parallel.
-		 */
-		queue_for_each_hw_ctx(q, hctx, i) {
-			if (!hctx->nr_expired)
-				continue;
-
-			if (!(hctx->flags & BLK_MQ_F_BLOCKING))
-				has_rcu = true;
-			else
-				synchronize_srcu(hctx->srcu);
-
-			hctx->nr_expired = 0;
-		}
-		if (has_rcu)
-			synchronize_rcu();
-
-		/* terminate the ones we won */
-		blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL);
-	}
+	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);
 
-	if (data.next_set) {
-		data.next = blk_rq_timeout(round_jiffies_up(data.next));
-		mod_timer(&q->timeout, data.next);
+	if (next != 0) {
+		mod_timer(&q->timeout, next);
 	} else {
 		/*
 		 * Request timeouts are handled as a forward rolling timer. If
@@ -1029,7 +926,7 @@ static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
 	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
 
 	spin_lock(&ctx->lock);
-	if (unlikely(!list_empty(&ctx->rq_list))) {
+	if (!list_empty(&ctx->rq_list)) {
 		dispatch_data->rq = list_entry_rq(ctx->rq_list.next);
 		list_del_init(&dispatch_data->rq->queuelist);
 		if (list_empty(&ctx->rq_list))
@@ -1716,15 +1613,6 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
 	blk_account_io_start(rq, true);
 }
 
-static inline void blk_mq_queue_io(struct blk_mq_hw_ctx *hctx,
-				   struct blk_mq_ctx *ctx,
-				   struct request *rq)
-{
-	spin_lock(&ctx->lock);
-	__blk_mq_insert_request(hctx, rq, false);
-	spin_unlock(&ctx->lock);
-}
-
 static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
 {
 	if (rq->tag != -1)
@@ -1882,7 +1770,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		return BLK_QC_T_NONE;
 	}
 
-	wbt_track(&rq->issue_stat, wb_acct);
+	wbt_track(rq, wb_acct);
 
 	cookie = request_to_qc_t(data.hctx, rq);
 
@@ -1949,15 +1837,10 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		blk_mq_put_ctx(data.ctx);
 		blk_mq_bio_to_request(rq, bio);
 		blk_mq_try_issue_directly(data.hctx, rq, &cookie);
-	} else if (q->elevator) {
-		blk_mq_put_ctx(data.ctx);
-		blk_mq_bio_to_request(rq, bio);
-		blk_mq_sched_insert_request(rq, false, true, true);
 	} else {
 		blk_mq_put_ctx(data.ctx);
 		blk_mq_bio_to_request(rq, bio);
-		blk_mq_queue_io(data.hctx, data.ctx, rq);
-		blk_mq_run_hw_queue(data.hctx, true);
+		blk_mq_sched_insert_request(rq, false, true, true);
 	}
 
 	return cookie;
@@ -2056,15 +1939,7 @@ static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
 			return ret;
 	}
 
-	seqcount_init(&rq->gstate_seq);
-	u64_stats_init(&rq->aborted_gstate_sync);
-	/*
-	 * start gstate with gen 1 instead of 0, otherwise it will be equal
-	 * to aborted_gstate, and be identified timed out by
-	 * blk_mq_terminate_expired.
-	 */
-	WRITE_ONCE(rq->gstate, MQ_RQ_GEN_INC);
-
+	WRITE_ONCE(rq->state, MQ_RQ_IDLE);
 	return 0;
 }
 
@@ -2365,6 +2240,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 	queue_for_each_hw_ctx(q, hctx, i) {
 		cpumask_clear(hctx->cpumask);
 		hctx->nr_ctx = 0;
+		hctx->dispatch_from = NULL;
 	}
 
 	/*
@@ -2697,7 +2573,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
 		int ret;
 
-		ret = blk_mq_sched_init(q);
+		ret = elevator_init_mq(q);
 		if (ret)
 			return ERR_PTR(ret);
 	}
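
The timeout rework earlier in this file (blk_mq_check_expired above) pins a request with refcount_inc_not_zero() before re-checking expiry, so a racing natural completion either wins outright or leaves the timeout path holding the last reference. A self-contained sketch of that pattern on a hypothetical object, not struct request:

#include <linux/jiffies.h>
#include <linux/printk.h>
#include <linux/refcount.h>
#include <linux/slab.h>

struct my_obj {
	refcount_t ref;
	unsigned long deadline;
};

static bool my_obj_expired(struct my_obj *obj)
{
	return time_after_eq(jiffies, obj->deadline);
}

static void my_check_timeout(struct my_obj *obj)
{
	/* Pin the object only if it is still live; otherwise completion won. */
	if (!refcount_inc_not_zero(&obj->ref))
		return;

	/* Re-verify now that the object cannot be freed underneath us. */
	if (my_obj_expired(obj))
		pr_warn("my_obj %p timed out\n", obj);

	/* Drop the pin; free if completion already dropped its reference. */
	if (refcount_dec_and_test(&obj->ref))
		kfree(obj);
}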

+ 2 - 40
block/blk-mq.h

@@ -30,20 +30,6 @@ struct blk_mq_ctx {
 	struct kobject		kobj;
 } ____cacheline_aligned_in_smp;
 
-/*
- * Bits for request->gstate.  The lower two bits carry MQ_RQ_* state value
- * and the upper bits the generation number.
- */
-enum mq_rq_state {
-	MQ_RQ_IDLE		= 0,
-	MQ_RQ_IN_FLIGHT		= 1,
-	MQ_RQ_COMPLETE		= 2,
-
-	MQ_RQ_STATE_BITS	= 2,
-	MQ_RQ_STATE_MASK	= (1 << MQ_RQ_STATE_BITS) - 1,
-	MQ_RQ_GEN_INC		= 1 << MQ_RQ_STATE_BITS,
-};
-
 void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_free_queue(struct request_queue *q);
 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
@@ -107,33 +93,9 @@ void blk_mq_release(struct request_queue *q);
  * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
  * @rq: target request.
  */
-static inline int blk_mq_rq_state(struct request *rq)
+static inline enum mq_rq_state blk_mq_rq_state(struct request *rq)
 {
-	return READ_ONCE(rq->gstate) & MQ_RQ_STATE_MASK;
-}
-
-/**
- * blk_mq_rq_update_state() - set the current MQ_RQ_* state of a request
- * @rq: target request.
- * @state: new state to set.
- *
- * Set @rq's state to @state.  The caller is responsible for ensuring that
- * there are no other updaters.  A request can transition into IN_FLIGHT
- * only from IDLE and doing so increments the generation number.
- */
-static inline void blk_mq_rq_update_state(struct request *rq,
-					  enum mq_rq_state state)
-{
-	u64 old_val = READ_ONCE(rq->gstate);
-	u64 new_val = (old_val & ~MQ_RQ_STATE_MASK) | state;
-
-	if (state == MQ_RQ_IN_FLIGHT) {
-		WARN_ON_ONCE((old_val & MQ_RQ_STATE_MASK) != MQ_RQ_IDLE);
-		new_val += MQ_RQ_GEN_INC;
-	}
-
-	/* avoid exposing interim values */
-	WRITE_ONCE(rq->gstate, new_val);
+	return READ_ONCE(rq->state);
 }
 
 static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,

+ 3 - 7
block/blk-stat.c

@@ -47,19 +47,15 @@ static void __blk_stat_add(struct blk_rq_stat *stat, u64 value)
 	stat->nr_samples++;
 }
 
-void blk_stat_add(struct request *rq)
+void blk_stat_add(struct request *rq, u64 now)
 {
 	struct request_queue *q = rq->q;
 	struct blk_stat_callback *cb;
 	struct blk_rq_stat *stat;
 	int bucket;
-	u64 now, value;
+	u64 value;
 
-	now = __blk_stat_time(ktime_to_ns(ktime_get()));
-	if (now < blk_stat_time(&rq->issue_stat))
-		return;
-
-	value = now - blk_stat_time(&rq->issue_stat);
+	value = (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0;
 
 	blk_throtl_stat_add(rq, value);
 

+ 1 - 44
block/blk-stat.h

@@ -8,21 +8,6 @@
 #include <linux/rcupdate.h>
 #include <linux/timer.h>
 
-/*
- * from upper:
- * 3 bits: reserved for other usage
- * 12 bits: size
- * 49 bits: time
- */
-#define BLK_STAT_RES_BITS	3
-#define BLK_STAT_SIZE_BITS	12
-#define BLK_STAT_RES_SHIFT	(64 - BLK_STAT_RES_BITS)
-#define BLK_STAT_SIZE_SHIFT	(BLK_STAT_RES_SHIFT - BLK_STAT_SIZE_BITS)
-#define BLK_STAT_TIME_MASK	((1ULL << BLK_STAT_SIZE_SHIFT) - 1)
-#define BLK_STAT_SIZE_MASK	\
-	(((1ULL << BLK_STAT_SIZE_BITS) - 1) << BLK_STAT_SIZE_SHIFT)
-#define BLK_STAT_RES_MASK	(~((1ULL << BLK_STAT_RES_SHIFT) - 1))
-
 /**
  * struct blk_stat_callback - Block statistics callback.
  *
@@ -80,35 +65,7 @@ struct blk_stat_callback {
 struct blk_queue_stats *blk_alloc_queue_stats(void);
 void blk_free_queue_stats(struct blk_queue_stats *);
 
-void blk_stat_add(struct request *);
-
-static inline u64 __blk_stat_time(u64 time)
-{
-	return time & BLK_STAT_TIME_MASK;
-}
-
-static inline u64 blk_stat_time(struct blk_issue_stat *stat)
-{
-	return __blk_stat_time(stat->stat);
-}
-
-static inline sector_t blk_capped_size(sector_t size)
-{
-	return size & ((1ULL << BLK_STAT_SIZE_BITS) - 1);
-}
-
-static inline sector_t blk_stat_size(struct blk_issue_stat *stat)
-{
-	return (stat->stat & BLK_STAT_SIZE_MASK) >> BLK_STAT_SIZE_SHIFT;
-}
-
-static inline void blk_stat_set_issue(struct blk_issue_stat *stat,
-	sector_t size)
-{
-	stat->stat = (stat->stat & BLK_STAT_RES_MASK) |
-		(ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK) |
-		(((u64)blk_capped_size(size)) << BLK_STAT_SIZE_SHIFT);
-}
+void blk_stat_add(struct request *rq, u64 now);
 
 /* record time/size info in request but not add a callback */
 void blk_stat_enable_accounting(struct request_queue *q);
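
For context on the blk-stat simplification above: the removed issue_stat encoding packed a 49-bit timestamp and a 12-bit size into one u64, which the new plain start_time_ns/io_start_time_ns fields and rq->throtl_size make unnecessary. A quick userspace calculation of what those packed widths allowed, assuming 512-byte sectors:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t max_ns = (1ULL << 49) - 1;		/* 49-bit timestamp */
	uint64_t max_sectors = (1ULL << 12) - 1;	/* 12-bit size */

	printf("timestamp range: %llu ns (~%.1f days)\n",
	       (unsigned long long)max_ns, max_ns / 86400e9);
	printf("size cap: %llu sectors (%llu KiB)\n",
	       (unsigned long long)max_sectors,
	       (unsigned long long)(max_sectors * 512 / 1024));
	return 0;
}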

+ 45 - 35
block/blk-sysfs.c

@@ -491,188 +491,198 @@ static ssize_t queue_wc_store(struct request_queue *q, const char *page,
 	return count;
 }
 
+static ssize_t queue_fua_show(struct request_queue *q, char *page)
+{
+	return sprintf(page, "%u\n", test_bit(QUEUE_FLAG_FUA, &q->queue_flags));
+}
+
 static ssize_t queue_dax_show(struct request_queue *q, char *page)
 {
 	return queue_var_show(blk_queue_dax(q), page);
 }
 
 static struct queue_sysfs_entry queue_requests_entry = {
-	.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
+	.attr = {.name = "nr_requests", .mode = 0644 },
 	.show = queue_requests_show,
 	.store = queue_requests_store,
 };
 
 static struct queue_sysfs_entry queue_ra_entry = {
-	.attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR },
+	.attr = {.name = "read_ahead_kb", .mode = 0644 },
 	.show = queue_ra_show,
 	.store = queue_ra_store,
 };
 
 static struct queue_sysfs_entry queue_max_sectors_entry = {
-	.attr = {.name = "max_sectors_kb", .mode = S_IRUGO | S_IWUSR },
+	.attr = {.name = "max_sectors_kb", .mode = 0644 },
 	.show = queue_max_sectors_show,
 	.store = queue_max_sectors_store,
 };
 
 static struct queue_sysfs_entry queue_max_hw_sectors_entry = {
-	.attr = {.name = "max_hw_sectors_kb", .mode = S_IRUGO },
+	.attr = {.name = "max_hw_sectors_kb", .mode = 0444 },
 	.show = queue_max_hw_sectors_show,
 };
 
 static struct queue_sysfs_entry queue_max_segments_entry = {
-	.attr = {.name = "max_segments", .mode = S_IRUGO },
+	.attr = {.name = "max_segments", .mode = 0444 },
 	.show = queue_max_segments_show,
 };
 
 static struct queue_sysfs_entry queue_max_discard_segments_entry = {
-	.attr = {.name = "max_discard_segments", .mode = S_IRUGO },
+	.attr = {.name = "max_discard_segments", .mode = 0444 },
 	.show = queue_max_discard_segments_show,
 };
 
 static struct queue_sysfs_entry queue_max_integrity_segments_entry = {
-	.attr = {.name = "max_integrity_segments", .mode = S_IRUGO },
+	.attr = {.name = "max_integrity_segments", .mode = 0444 },
 	.show = queue_max_integrity_segments_show,
 };
 
 static struct queue_sysfs_entry queue_max_segment_size_entry = {
-	.attr = {.name = "max_segment_size", .mode = S_IRUGO },
+	.attr = {.name = "max_segment_size", .mode = 0444 },
 	.show = queue_max_segment_size_show,
 };
 
 static struct queue_sysfs_entry queue_iosched_entry = {
-	.attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR },
+	.attr = {.name = "scheduler", .mode = 0644 },
 	.show = elv_iosched_show,
 	.store = elv_iosched_store,
 };
 
 static struct queue_sysfs_entry queue_hw_sector_size_entry = {
-	.attr = {.name = "hw_sector_size", .mode = S_IRUGO },
+	.attr = {.name = "hw_sector_size", .mode = 0444 },
 	.show = queue_logical_block_size_show,
 };
 
 static struct queue_sysfs_entry queue_logical_block_size_entry = {
-	.attr = {.name = "logical_block_size", .mode = S_IRUGO },
+	.attr = {.name = "logical_block_size", .mode = 0444 },
 	.show = queue_logical_block_size_show,
 };
 
 static struct queue_sysfs_entry queue_physical_block_size_entry = {
-	.attr = {.name = "physical_block_size", .mode = S_IRUGO },
+	.attr = {.name = "physical_block_size", .mode = 0444 },
 	.show = queue_physical_block_size_show,
 };
 
 static struct queue_sysfs_entry queue_chunk_sectors_entry = {
-	.attr = {.name = "chunk_sectors", .mode = S_IRUGO },
+	.attr = {.name = "chunk_sectors", .mode = 0444 },
 	.show = queue_chunk_sectors_show,
 };
 
 static struct queue_sysfs_entry queue_io_min_entry = {
-	.attr = {.name = "minimum_io_size", .mode = S_IRUGO },
+	.attr = {.name = "minimum_io_size", .mode = 0444 },
 	.show = queue_io_min_show,
 };
 
 static struct queue_sysfs_entry queue_io_opt_entry = {
-	.attr = {.name = "optimal_io_size", .mode = S_IRUGO },
+	.attr = {.name = "optimal_io_size", .mode = 0444 },
 	.show = queue_io_opt_show,
 };
 
 static struct queue_sysfs_entry queue_discard_granularity_entry = {
-	.attr = {.name = "discard_granularity", .mode = S_IRUGO },
+	.attr = {.name = "discard_granularity", .mode = 0444 },
 	.show = queue_discard_granularity_show,
 };
 
 static struct queue_sysfs_entry queue_discard_max_hw_entry = {
-	.attr = {.name = "discard_max_hw_bytes", .mode = S_IRUGO },
+	.attr = {.name = "discard_max_hw_bytes", .mode = 0444 },
 	.show = queue_discard_max_hw_show,
 };
 
 static struct queue_sysfs_entry queue_discard_max_entry = {
-	.attr = {.name = "discard_max_bytes", .mode = S_IRUGO | S_IWUSR },
+	.attr = {.name = "discard_max_bytes", .mode = 0644 },
 	.show = queue_discard_max_show,
 	.store = queue_discard_max_store,
 };
 
 static struct queue_sysfs_entry queue_discard_zeroes_data_entry = {
-	.attr = {.name = "discard_zeroes_data", .mode = S_IRUGO },
+	.attr = {.name = "discard_zeroes_data", .mode = 0444 },
 	.show = queue_discard_zeroes_data_show,
 };
 
 static struct queue_sysfs_entry queue_write_same_max_entry = {
-	.attr = {.name = "write_same_max_bytes", .mode = S_IRUGO },
+	.attr = {.name = "write_same_max_bytes", .mode = 0444 },
 	.show = queue_write_same_max_show,
 };
 
 static struct queue_sysfs_entry queue_write_zeroes_max_entry = {
-	.attr = {.name = "write_zeroes_max_bytes", .mode = S_IRUGO },
+	.attr = {.name = "write_zeroes_max_bytes", .mode = 0444 },
 	.show = queue_write_zeroes_max_show,
 };
 
 static struct queue_sysfs_entry queue_nonrot_entry = {
-	.attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
+	.attr = {.name = "rotational", .mode = 0644 },
 	.show = queue_show_nonrot,
 	.store = queue_store_nonrot,
 };
 
 static struct queue_sysfs_entry queue_zoned_entry = {
-	.attr = {.name = "zoned", .mode = S_IRUGO },
+	.attr = {.name = "zoned", .mode = 0444 },
 	.show = queue_zoned_show,
 };
 
 static struct queue_sysfs_entry queue_nomerges_entry = {
-	.attr = {.name = "nomerges", .mode = S_IRUGO | S_IWUSR },
+	.attr = {.name = "nomerges", .mode = 0644 },
 	.show = queue_nomerges_show,
 	.store = queue_nomerges_store,
 };
 
 static struct queue_sysfs_entry queue_rq_affinity_entry = {
-	.attr = {.name = "rq_affinity", .mode = S_IRUGO | S_IWUSR },
+	.attr = {.name = "rq_affinity", .mode = 0644 },
 	.show = queue_rq_affinity_show,
 	.store = queue_rq_affinity_store,
 };
 
 static struct queue_sysfs_entry queue_iostats_entry = {
-	.attr = {.name = "iostats", .mode = S_IRUGO | S_IWUSR },
+	.attr = {.name = "iostats", .mode = 0644 },
 	.show = queue_show_iostats,
 	.store = queue_store_iostats,
 };
 
 static struct queue_sysfs_entry queue_random_entry = {
-	.attr = {.name = "add_random", .mode = S_IRUGO | S_IWUSR },
+	.attr = {.name = "add_random", .mode = 0644 },
 	.show = queue_show_random,
 	.store = queue_store_random,
 };
 
 static struct queue_sysfs_entry queue_poll_entry = {
-	.attr = {.name = "io_poll", .mode = S_IRUGO | S_IWUSR },
+	.attr = {.name = "io_poll", .mode = 0644 },
 	.show = queue_poll_show,
 	.store = queue_poll_store,
 };
 
 static struct queue_sysfs_entry queue_poll_delay_entry = {
-	.attr = {.name = "io_poll_delay", .mode = S_IRUGO | S_IWUSR },
+	.attr = {.name = "io_poll_delay", .mode = 0644 },
 	.show = queue_poll_delay_show,
 	.store = queue_poll_delay_store,
 };
 
 static struct queue_sysfs_entry queue_wc_entry = {
-	.attr = {.name = "write_cache", .mode = S_IRUGO | S_IWUSR },
+	.attr = {.name = "write_cache", .mode = 0644 },
 	.show = queue_wc_show,
 	.store = queue_wc_store,
 };
 
+static struct queue_sysfs_entry queue_fua_entry = {
+	.attr = {.name = "fua", .mode = 0444 },
+	.show = queue_fua_show,
+};
+
 static struct queue_sysfs_entry queue_dax_entry = {
-	.attr = {.name = "dax", .mode = S_IRUGO },
+	.attr = {.name = "dax", .mode = 0444 },
 	.show = queue_dax_show,
 };
 
 static struct queue_sysfs_entry queue_wb_lat_entry = {
-	.attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR },
+	.attr = {.name = "wbt_lat_usec", .mode = 0644 },
 	.show = queue_wb_lat_show,
 	.store = queue_wb_lat_store,
 };
 
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 static struct queue_sysfs_entry throtl_sample_time_entry = {
-	.attr = {.name = "throttle_sample_time", .mode = S_IRUGO | S_IWUSR },
+	.attr = {.name = "throttle_sample_time", .mode = 0644 },
 	.show = blk_throtl_sample_time_show,
 	.store = blk_throtl_sample_time_store,
 };
@@ -708,6 +718,7 @@ static struct attribute *default_attrs[] = {
 	&queue_random_entry.attr,
 	&queue_poll_entry.attr,
 	&queue_wc_entry.attr,
+	&queue_fua_entry.attr,
 	&queue_dax_entry.attr,
 	&queue_wb_lat_entry.attr,
 	&queue_poll_delay_entry.attr,
@@ -813,8 +824,7 @@ static void __blk_release_queue(struct work_struct *work)
 	if (q->mq_ops)
 		blk_mq_debugfs_unregister(q);
 
-	if (q->bio_split)
-		bioset_free(q->bio_split);
+	bioset_exit(&q->bio_split);
 
 	ida_simple_remove(&blk_queue_ida, q->id);
 	call_rcu(&q->rcu_head, blk_free_queue_rcu);
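
Besides the octal permission conversion noted in the changelog, the blk-sysfs.c hunks add a read-only "fua" queue attribute next to "write_cache". A trivial userspace check of the new file; the device path is only an example:

#include <stdio.h>

int main(void)
{
	char buf[8];
	FILE *f = fopen("/sys/block/sda/queue/fua", "r");

	if (f && fgets(buf, sizeof(buf), f))
		printf("fua: %s", buf);	/* "1\n" when the device honors FUA writes */
	if (f)
		fclose(f);
	return 0;
}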

+ 16 - 19
block/blk-throttle.c

@@ -36,8 +36,6 @@ static int throtl_quantum = 32;
  */
 #define LATENCY_FILTERED_HD (1000L) /* 1ms */
 
-#define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT)
-
 static struct blkcg_policy blkcg_policy_throtl;
 
 /* A workqueue to queue throttle related work */
@@ -821,7 +819,7 @@ static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
 	if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
 		return false;
 
-	return 1;
+	return true;
 }
 
 /* Trim the used slices and adjust slice start accordingly */
@@ -931,7 +929,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
 
 	if (wait)
 		*wait = jiffy_wait;
-	return 0;
+	return false;
 }
 
 static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
@@ -974,7 +972,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
 	jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
 	if (wait)
 		*wait = jiffy_wait;
-	return 0;
+	return false;
 }
 
 /*
@@ -1024,7 +1022,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
 	    tg_with_in_iops_limit(tg, bio, &iops_wait)) {
 		if (wait)
 			*wait = 0;
-		return 1;
+		return true;
 	}
 
 	max_wait = max(bps_wait, iops_wait);
@@ -1035,7 +1033,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
 	if (time_before(tg->slice_end[rw], jiffies + max_wait))
 		throtl_extend_slice(tg, rw, jiffies + max_wait);
 
-	return 0;
+	return false;
 }
 
 static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
@@ -1209,7 +1207,7 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
 
 	while (1) {
 		struct throtl_grp *tg = throtl_rb_first(parent_sq);
-		struct throtl_service_queue *sq = &tg->service_queue;
+		struct throtl_service_queue *sq;
 
 		if (!tg)
 			break;
@@ -1221,6 +1219,7 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
 
 		nr_disp += throtl_dispatch_tg(tg);
 
+		sq = &tg->service_queue;
 		if (sq->nr_queued[0] || sq->nr_queued[1])
 			tg_update_disptime(tg);
 
@@ -2139,7 +2138,7 @@ static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
 		bio->bi_cg_private = tg;
 		blkg_get(tg_to_blkg(tg));
 	}
-	blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio));
+	bio_issue_init(&bio->bi_issue, bio_sectors(bio));
 #endif
 }
 
@@ -2251,7 +2250,7 @@ out:
 
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 	if (throttled || !td->track_bio_latency)
-		bio->bi_issue_stat.stat |= SKIP_LATENCY;
+		bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY;
 #endif
 	return throttled;
 }
@@ -2281,8 +2280,7 @@ void blk_throtl_stat_add(struct request *rq, u64 time_ns)
 	struct request_queue *q = rq->q;
 	struct throtl_data *td = q->td;
 
-	throtl_track_latency(td, blk_stat_size(&rq->issue_stat),
-		req_op(rq), time_ns >> 10);
+	throtl_track_latency(td, rq->throtl_size, req_op(rq), time_ns >> 10);
 }
 
 void blk_throtl_bio_endio(struct bio *bio)
@@ -2302,8 +2300,8 @@ void blk_throtl_bio_endio(struct bio *bio)
 	finish_time_ns = ktime_get_ns();
 	tg->last_finish_time = finish_time_ns >> 10;
 
-	start_time = blk_stat_time(&bio->bi_issue_stat) >> 10;
-	finish_time = __blk_stat_time(finish_time_ns) >> 10;
+	start_time = bio_issue_time(&bio->bi_issue) >> 10;
+	finish_time = __bio_issue_time(finish_time_ns) >> 10;
 	if (!start_time || finish_time <= start_time) {
 		blkg_put(tg_to_blkg(tg));
 		return;
@@ -2311,16 +2309,15 @@ void blk_throtl_bio_endio(struct bio *bio)
 
 	lat = finish_time - start_time;
 	/* this is only for bio based driver */
-	if (!(bio->bi_issue_stat.stat & SKIP_LATENCY))
-		throtl_track_latency(tg->td, blk_stat_size(&bio->bi_issue_stat),
-			bio_op(bio), lat);
+	if (!(bio->bi_issue.value & BIO_ISSUE_THROTL_SKIP_LATENCY))
+		throtl_track_latency(tg->td, bio_issue_size(&bio->bi_issue),
+				     bio_op(bio), lat);
 
 	if (tg->latency_target && lat >= tg->td->filtered_latency) {
 		int bucket;
 		unsigned int threshold;
 
-		bucket = request_bucket_index(
-			blk_stat_size(&bio->bi_issue_stat));
+		bucket = request_bucket_index(bio_issue_size(&bio->bi_issue));
 		threshold = tg->td->avg_buckets[rw][bucket].latency +
 			tg->latency_target;
 		if (lat > threshold)
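
The blk-throttle hunks above are part of removing struct blk_issue_stat: the issue timestamp, the I/O size and the throttle skip-latency flag now travel in one packed 64-bit bio_issue value (bio_issue_init(), bio_issue_time(), bio_issue_size(), BIO_ISSUE_THROTL_SKIP_LATENCY). The standalone C sketch below only illustrates the packing idea; the field widths and helper names are invented for the example and are not the kernel's actual bio_issue layout.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative field widths only -- not the kernel's real bio_issue layout. */
    #define ISSUE_TIME_BITS    48
    #define ISSUE_SIZE_BITS    12
    #define ISSUE_TIME_MASK    ((UINT64_C(1) << ISSUE_TIME_BITS) - 1)
    #define ISSUE_SIZE_MASK    ((UINT64_C(1) << ISSUE_SIZE_BITS) - 1)
    #define ISSUE_SKIP_LATENCY (UINT64_C(1) << 63)

    struct issue { uint64_t value; };

    /* Pack a (truncated) timestamp and a size into one word, in the spirit of
     * bio_issue_init() filling bio->bi_issue. */
    static void issue_init(struct issue *is, uint64_t time_ns, uint64_t size)
    {
        is->value = (time_ns & ISSUE_TIME_MASK) |
                    ((size & ISSUE_SIZE_MASK) << ISSUE_TIME_BITS);
    }

    static uint64_t issue_time(const struct issue *is)
    {
        return is->value & ISSUE_TIME_MASK;
    }

    static uint64_t issue_size(const struct issue *is)
    {
        return (is->value >> ISSUE_TIME_BITS) & ISSUE_SIZE_MASK;
    }

    int main(void)
    {
        struct issue is;

        issue_init(&is, 123456789ULL, 8);   /* "issued at t", 8 sectors */
        is.value |= ISSUE_SKIP_LATENCY;     /* cf. BIO_ISSUE_THROTL_SKIP_LATENCY */

        assert(issue_time(&is) == 123456789ULL);
        assert(issue_size(&is) == 8);
        printf("time=%llu size=%llu skip=%d\n",
               (unsigned long long)issue_time(&is),
               (unsigned long long)issue_size(&is),
               !!(is.value & ISSUE_SKIP_LATENCY));
        return 0;
    }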

+ 1 - 5
block/blk-timeout.c

@@ -86,14 +86,11 @@ static void blk_rq_timed_out(struct request *req)
 	if (q->rq_timed_out_fn)
 		ret = q->rq_timed_out_fn(req);
 	switch (ret) {
-	case BLK_EH_HANDLED:
-		__blk_complete_request(req);
-		break;
 	case BLK_EH_RESET_TIMER:
 		blk_add_timer(req);
 		blk_clear_rq_complete(req);
 		break;
-	case BLK_EH_NOT_HANDLED:
+	case BLK_EH_DONE:
 		/*
 		 * LLD handles this for now but in the future
 		 * we can send a request msg to abort the command
@@ -214,7 +211,6 @@ void blk_add_timer(struct request *req)
 		req->timeout = q->rq_timeout;
 
 	blk_rq_set_deadline(req, jiffies + req->timeout);
-	req->rq_flags &= ~RQF_MQ_TIMEOUT_EXPIRED;
 
 	/*
 	 * Only the non-mq case needs to add the request to a protected list.

+ 80 - 49
block/blk-wbt.c

@@ -29,6 +29,26 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/wbt.h>
 
+static inline void wbt_clear_state(struct request *rq)
+{
+	rq->wbt_flags = 0;
+}
+
+static inline enum wbt_flags wbt_flags(struct request *rq)
+{
+	return rq->wbt_flags;
+}
+
+static inline bool wbt_is_tracked(struct request *rq)
+{
+	return rq->wbt_flags & WBT_TRACKED;
+}
+
+static inline bool wbt_is_read(struct request *rq)
+{
+	return rq->wbt_flags & WBT_READ;
+}
+
 enum {
 	/*
 	 * Default setting, we'll scale up (to 75% of QD max) or down (min 1)
@@ -101,9 +121,15 @@ static bool wb_recent_wait(struct rq_wb *rwb)
 	return time_before(jiffies, wb->dirty_sleep + HZ);
 }
 
-static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, bool is_kswapd)
+static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
+					  enum wbt_flags wb_acct)
 {
-	return &rwb->rq_wait[is_kswapd];
+	if (wb_acct & WBT_KSWAPD)
+		return &rwb->rq_wait[WBT_RWQ_KSWAPD];
+	else if (wb_acct & WBT_DISCARD)
+		return &rwb->rq_wait[WBT_RWQ_DISCARD];
+
+	return &rwb->rq_wait[WBT_RWQ_BG];
 }
 
 static void rwb_wake_all(struct rq_wb *rwb)
@@ -126,7 +152,7 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
 	if (!(wb_acct & WBT_TRACKED))
 		return;
 
-	rqw = get_rq_wait(rwb, wb_acct & WBT_KSWAPD);
+	rqw = get_rq_wait(rwb, wb_acct);
 	inflight = atomic_dec_return(&rqw->inflight);
 
 	/*
@@ -139,10 +165,13 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
 	}
 
 	/*
-	 * If the device does write back caching, drop further down
-	 * before we wake people up.
+	 * For discards, our limit is always the background limit. For writes, if
+	 * the device does write back caching, drop further down before we
+	 * wake people up.
 	 */
-	if (rwb->wc && !wb_recent_wait(rwb))
+	if (wb_acct & WBT_DISCARD)
+		limit = rwb->wb_background;
+	else if (rwb->wc && !wb_recent_wait(rwb))
 		limit = 0;
 	else
 		limit = rwb->wb_normal;
@@ -165,24 +194,24 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
  * Called on completion of a request. Note that it's also called when
  * a request is merged, when the request gets freed.
  */
-void wbt_done(struct rq_wb *rwb, struct blk_issue_stat *stat)
+void wbt_done(struct rq_wb *rwb, struct request *rq)
 {
 	if (!rwb)
 		return;
 
-	if (!wbt_is_tracked(stat)) {
-		if (rwb->sync_cookie == stat) {
+	if (!wbt_is_tracked(rq)) {
+		if (rwb->sync_cookie == rq) {
 			rwb->sync_issue = 0;
 			rwb->sync_cookie = NULL;
 		}
 
-		if (wbt_is_read(stat))
+		if (wbt_is_read(rq))
 			wb_timestamp(rwb, &rwb->last_comp);
 	} else {
-		WARN_ON_ONCE(stat == rwb->sync_cookie);
-		__wbt_done(rwb, wbt_stat_to_mask(stat));
+		WARN_ON_ONCE(rq == rwb->sync_cookie);
+		__wbt_done(rwb, wbt_flags(rq));
 	}
-	wbt_clear_state(stat);
+	wbt_clear_state(rq);
 }
 
 /*
@@ -479,6 +508,9 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
 {
 	unsigned int limit;
 
+	if ((rw & REQ_OP_MASK) == REQ_OP_DISCARD)
+		return rwb->wb_background;
+
 	/*
 	 * At this point we know it's a buffered write. If this is
 	 * kswapd trying to free memory, or REQ_SYNC is set, then
@@ -529,11 +561,12 @@ static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw,
  * Block if we will exceed our limit, or if we are currently waiting for
  * the timer to kick off queuing again.
  */
-static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
+static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
+		       unsigned long rw, spinlock_t *lock)
 	__releases(lock)
 	__acquires(lock)
 {
-	struct rq_wait *rqw = get_rq_wait(rwb, current_is_kswapd());
+	struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
 	DEFINE_WAIT(wait);
 
 	if (may_queue(rwb, rqw, &wait, rw))
@@ -559,21 +592,20 @@ static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
 
 static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
 {
-	const int op = bio_op(bio);
-
-	/*
-	 * If not a WRITE, do nothing
-	 */
-	if (op != REQ_OP_WRITE)
-		return false;
-
-	/*
-	 * Don't throttle WRITE_ODIRECT
-	 */
-	if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) == (REQ_SYNC | REQ_IDLE))
+	switch (bio_op(bio)) {
+	case REQ_OP_WRITE:
+		/*
+		 * Don't throttle WRITE_ODIRECT
+		 */
+		if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) ==
+		    (REQ_SYNC | REQ_IDLE))
+			return false;
+		/* fallthrough */
+	case REQ_OP_DISCARD:
+		return true;
+	default:
 		return false;
-
-	return true;
+	}
 }
 
 /*
@@ -584,7 +616,7 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
  */
 enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock)
 {
-	unsigned int ret = 0;
+	enum wbt_flags ret = 0;
 
 	if (!rwb_enabled(rwb))
 		return 0;
@@ -598,41 +630,42 @@ enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock)
 		return ret;
 	}
 
-	__wbt_wait(rwb, bio->bi_opf, lock);
+	if (current_is_kswapd())
+		ret |= WBT_KSWAPD;
+	if (bio_op(bio) == REQ_OP_DISCARD)
+		ret |= WBT_DISCARD;
+
+	__wbt_wait(rwb, ret, bio->bi_opf, lock);
 
 	if (!blk_stat_is_active(rwb->cb))
 		rwb_arm_timer(rwb);
 
-	if (current_is_kswapd())
-		ret |= WBT_KSWAPD;
-
 	return ret | WBT_TRACKED;
 }
 
-void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat)
+void wbt_issue(struct rq_wb *rwb, struct request *rq)
 {
 	if (!rwb_enabled(rwb))
 		return;
 
 	/*
-	 * Track sync issue, in case it takes a long time to complete. Allows
-	 * us to react quicker, if a sync IO takes a long time to complete.
-	 * Note that this is just a hint. 'stat' can go away when the
-	 * request completes, so it's important we never dereference it. We
-	 * only use the address to compare with, which is why we store the
-	 * sync_issue time locally.
+	 * Track sync issue, in case it takes a long time to complete. Allows us
+	 * to react quicker, if a sync IO takes a long time to complete. Note
+	 * that this is just a hint. The request can go away when it completes,
+	 * so it's important we never dereference it. We only use the address to
+	 * compare with, which is why we store the sync_issue time locally.
 	 */
-	if (wbt_is_read(stat) && !rwb->sync_issue) {
-		rwb->sync_cookie = stat;
-		rwb->sync_issue = blk_stat_time(stat);
+	if (wbt_is_read(rq) && !rwb->sync_issue) {
+		rwb->sync_cookie = rq;
+		rwb->sync_issue = rq->io_start_time_ns;
 	}
 }
 
-void wbt_requeue(struct rq_wb *rwb, struct blk_issue_stat *stat)
+void wbt_requeue(struct rq_wb *rwb, struct request *rq)
 {
 	if (!rwb_enabled(rwb))
 		return;
-	if (stat == rwb->sync_cookie) {
+	if (rq == rwb->sync_cookie) {
 		rwb->sync_issue = 0;
 		rwb->sync_cookie = NULL;
 	}
@@ -701,7 +734,7 @@ static int wbt_data_dir(const struct request *rq)
 
 	if (op == REQ_OP_READ)
 		return READ;
-	else if (op == REQ_OP_WRITE || op == REQ_OP_FLUSH)
+	else if (op_is_write(op))
 		return WRITE;
 
 	/* don't account */
@@ -713,8 +746,6 @@ int wbt_init(struct request_queue *q)
 	struct rq_wb *rwb;
 	int i;
 
-	BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS);
-
 	rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
 	if (!rwb)
 		return -ENOMEM;
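
The blk-wbt changes store the throttling flags directly on struct request and add a WBT_DISCARD class with its own inflight queue, so get_rq_wait() now picks a queue from the flags instead of from a single is_kswapd bool. A minimal userspace sketch of that selection, with the flag values taken from the blk-wbt.h hunk further down and the queue indices assumed to mirror WBT_RWQ_*:

    #include <stdio.h>

    /* Flag bits as added in the blk-wbt.h hunk below. */
    enum wbt_flags {
        WBT_TRACKED = 1,    /* write, tracked for throttling */
        WBT_READ    = 2,
        WBT_KSWAPD  = 4,    /* write issued by kswapd */
        WBT_DISCARD = 8,    /* discard */
    };

    enum { RWQ_BG, RWQ_KSWAPD, RWQ_DISCARD, NUM_RWQ };

    /* Mirrors get_rq_wait(): kswapd writeback and discards each get a dedicated
     * inflight queue, everything else shares the background one. */
    static int rq_wait_index(unsigned int wb_acct)
    {
        if (wb_acct & WBT_KSWAPD)
            return RWQ_KSWAPD;
        if (wb_acct & WBT_DISCARD)
            return RWQ_DISCARD;
        return RWQ_BG;
    }

    int main(void)
    {
        printf("tracked write    -> queue %d\n", rq_wait_index(WBT_TRACKED));
        printf("kswapd writeback -> queue %d\n", rq_wait_index(WBT_TRACKED | WBT_KSWAPD));
        printf("discard          -> queue %d\n", rq_wait_index(WBT_TRACKED | WBT_DISCARD));
        return 0;
    }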

+ 21 - 34
block/blk-wbt.h

@@ -14,12 +14,16 @@ enum wbt_flags {
 	WBT_TRACKED		= 1,	/* write, tracked for throttling */
 	WBT_READ		= 2,	/* read */
 	WBT_KSWAPD		= 4,	/* write, from kswapd */
+	WBT_DISCARD		= 8,	/* discard */
 
-	WBT_NR_BITS		= 3,	/* number of bits */
+	WBT_NR_BITS		= 4,	/* number of bits */
 };
 
 enum {
-	WBT_NUM_RWQ		= 2,
+	WBT_RWQ_BG		= 0,
+	WBT_RWQ_KSWAPD,
+	WBT_RWQ_DISCARD,
+	WBT_NUM_RWQ,
 };
 
 /*
@@ -31,31 +35,6 @@ enum {
 	WBT_STATE_ON_MANUAL	= 2,
 };
 
-static inline void wbt_clear_state(struct blk_issue_stat *stat)
-{
-	stat->stat &= ~BLK_STAT_RES_MASK;
-}
-
-static inline enum wbt_flags wbt_stat_to_mask(struct blk_issue_stat *stat)
-{
-	return (stat->stat & BLK_STAT_RES_MASK) >> BLK_STAT_RES_SHIFT;
-}
-
-static inline void wbt_track(struct blk_issue_stat *stat, enum wbt_flags wb_acct)
-{
-	stat->stat |= ((u64) wb_acct) << BLK_STAT_RES_SHIFT;
-}
-
-static inline bool wbt_is_tracked(struct blk_issue_stat *stat)
-{
-	return (stat->stat >> BLK_STAT_RES_SHIFT) & WBT_TRACKED;
-}
-
-static inline bool wbt_is_read(struct blk_issue_stat *stat)
-{
-	return (stat->stat >> BLK_STAT_RES_SHIFT) & WBT_READ;
-}
-
 struct rq_wait {
 	wait_queue_head_t wait;
 	atomic_t inflight;
@@ -84,7 +63,7 @@ struct rq_wb {
 
 	struct blk_stat_callback *cb;
 
-	s64 sync_issue;
+	u64 sync_issue;
 	void *sync_cookie;
 
 	unsigned int wc;
@@ -109,14 +88,19 @@ static inline unsigned int wbt_inflight(struct rq_wb *rwb)
 
 #ifdef CONFIG_BLK_WBT
 
+static inline void wbt_track(struct request *rq, enum wbt_flags flags)
+{
+	rq->wbt_flags |= flags;
+}
+
 void __wbt_done(struct rq_wb *, enum wbt_flags);
-void wbt_done(struct rq_wb *, struct blk_issue_stat *);
+void wbt_done(struct rq_wb *, struct request *);
 enum wbt_flags wbt_wait(struct rq_wb *, struct bio *, spinlock_t *);
 int wbt_init(struct request_queue *);
 void wbt_exit(struct request_queue *);
 void wbt_update_limits(struct rq_wb *);
-void wbt_requeue(struct rq_wb *, struct blk_issue_stat *);
-void wbt_issue(struct rq_wb *, struct blk_issue_stat *);
+void wbt_requeue(struct rq_wb *, struct request *);
+void wbt_issue(struct rq_wb *, struct request *);
 void wbt_disable_default(struct request_queue *);
 void wbt_enable_default(struct request_queue *);
 
@@ -127,10 +111,13 @@ u64 wbt_default_latency_nsec(struct request_queue *);
 
 #else
 
+static inline void wbt_track(struct request *rq, enum wbt_flags flags)
+{
+}
 static inline void __wbt_done(struct rq_wb *rwb, enum wbt_flags flags)
 {
 }
-static inline void wbt_done(struct rq_wb *rwb, struct blk_issue_stat *stat)
+static inline void wbt_done(struct rq_wb *rwb, struct request *rq)
 {
 }
 static inline enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio,
@@ -148,10 +135,10 @@ static inline void wbt_exit(struct request_queue *q)
 static inline void wbt_update_limits(struct rq_wb *rwb)
 {
 }
-static inline void wbt_requeue(struct rq_wb *rwb, struct blk_issue_stat *stat)
+static inline void wbt_requeue(struct rq_wb *rwb, struct request *rq)
 {
 }
-static inline void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat)
+static inline void wbt_issue(struct rq_wb *rwb, struct request *rq)
 {
 }
 static inline void wbt_disable_default(struct request_queue *q)

+ 6 - 2
block/blk-zoned.c

@@ -328,7 +328,11 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
 	if (!rep.nr_zones)
 		return -EINVAL;
 
-	zones = kcalloc(rep.nr_zones, sizeof(struct blk_zone), GFP_KERNEL);
+	if (rep.nr_zones > INT_MAX / sizeof(struct blk_zone))
+		return -ERANGE;
+
+	zones = kvmalloc(rep.nr_zones * sizeof(struct blk_zone),
+			GFP_KERNEL | __GFP_ZERO);
 	if (!zones)
 		return -ENOMEM;
 
@@ -350,7 +354,7 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
 	}
 
  out:
-	kfree(zones);
+	kvfree(zones);
 
 	return ret;
 }
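
Because kvmalloc() does not give the implicit multiplication-overflow check that kcalloc() provided, the blk-zoned.c hunk has to add an explicit bound test before computing nr_zones * sizeof(struct blk_zone). The same guard-then-allocate pattern in plain userspace C (malloc/memset standing in for kvmalloc with __GFP_ZERO):

    #include <errno.h>
    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    struct zone { uint64_t start, len; };   /* stand-in for struct blk_zone */

    /* Refuse counts whose byte size would overflow before allocating a zeroed
     * array -- the check that kcalloc() used to do for us. */
    static struct zone *alloc_zones(size_t nr)
    {
        struct zone *zones;
        size_t bytes;

        if (nr > SIZE_MAX / sizeof(struct zone)) {
            errno = ERANGE;
            return NULL;
        }
        bytes = nr * sizeof(struct zone);
        zones = malloc(bytes);      /* kvmalloc(..., __GFP_ZERO) in the kernel */
        if (zones)
            memset(zones, 0, bytes);
        return zones;
    }

    int main(void)
    {
        struct zone *zones = alloc_zones(128);

        if (!zones)
            return 1;
        /* ... fill in and report zones here ... */
        free(zones);                /* kvfree() in the kernel */
        return 0;
    }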

+ 4 - 1
block/blk.h

@@ -186,7 +186,7 @@ unsigned int blk_plug_queued_count(struct request_queue *q);
 
 void blk_account_io_start(struct request *req, bool new_io);
 void blk_account_io_completion(struct request *req, unsigned int bytes);
-void blk_account_io_done(struct request *req);
+void blk_account_io_done(struct request *req, u64 now);
 
 /*
  * EH timer and IO completion will both attempt to 'grab' the request, make
@@ -231,6 +231,9 @@ static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq
 		e->type->ops.sq.elevator_deactivate_req_fn(q, rq);
 }
 
+int elevator_init(struct request_queue *);
+int elevator_init_mq(struct request_queue *q);
+void elevator_exit(struct request_queue *, struct elevator_queue *);
 int elv_register_queue(struct request_queue *q);
 void elv_unregister_queue(struct request_queue *q);
 

+ 26 - 26
block/bounce.c

@@ -28,28 +28,29 @@
 #define POOL_SIZE	64
 #define ISA_POOL_SIZE	16
 
-static struct bio_set *bounce_bio_set, *bounce_bio_split;
-static mempool_t *page_pool, *isa_page_pool;
+static struct bio_set bounce_bio_set, bounce_bio_split;
+static mempool_t page_pool, isa_page_pool;
 
 #if defined(CONFIG_HIGHMEM)
 static __init int init_emergency_pool(void)
 {
+	int ret;
 #if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG)
 	if (max_pfn <= max_low_pfn)
 		return 0;
 #endif
 
-	page_pool = mempool_create_page_pool(POOL_SIZE, 0);
-	BUG_ON(!page_pool);
+	ret = mempool_init_page_pool(&page_pool, POOL_SIZE, 0);
+	BUG_ON(ret);
 	pr_info("pool size: %d pages\n", POOL_SIZE);
 
-	bounce_bio_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
-	BUG_ON(!bounce_bio_set);
-	if (bioset_integrity_create(bounce_bio_set, BIO_POOL_SIZE))
+	ret = bioset_init(&bounce_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
+	BUG_ON(ret);
+	if (bioset_integrity_create(&bounce_bio_set, BIO_POOL_SIZE))
 		BUG_ON(1);
 
-	bounce_bio_split = bioset_create(BIO_POOL_SIZE, 0, 0);
-	BUG_ON(!bounce_bio_split);
+	ret = bioset_init(&bounce_bio_split, BIO_POOL_SIZE, 0, 0);
+	BUG_ON(ret);
 
 	return 0;
 }
@@ -63,14 +64,11 @@ __initcall(init_emergency_pool);
  */
 static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
 {
-	unsigned long flags;
 	unsigned char *vto;
 
-	local_irq_save(flags);
 	vto = kmap_atomic(to->bv_page);
 	memcpy(vto + to->bv_offset, vfrom, to->bv_len);
 	kunmap_atomic(vto);
-	local_irq_restore(flags);
 }
 
 #else /* CONFIG_HIGHMEM */
@@ -94,12 +92,14 @@ static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
  */
 int init_emergency_isa_pool(void)
 {
-	if (isa_page_pool)
+	int ret;
+
+	if (mempool_initialized(&isa_page_pool))
 		return 0;
 
-	isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa,
-				       mempool_free_pages, (void *) 0);
-	BUG_ON(!isa_page_pool);
+	ret = mempool_init(&isa_page_pool, ISA_POOL_SIZE, mempool_alloc_pages_isa,
+			   mempool_free_pages, (void *) 0);
+	BUG_ON(ret);
 
 	pr_info("isa pool size: %d pages\n", ISA_POOL_SIZE);
 	return 0;
@@ -166,13 +166,13 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool)
 
 static void bounce_end_io_write(struct bio *bio)
 {
-	bounce_end_io(bio, page_pool);
+	bounce_end_io(bio, &page_pool);
 }
 
 static void bounce_end_io_write_isa(struct bio *bio)
 {
 
-	bounce_end_io(bio, isa_page_pool);
+	bounce_end_io(bio, &isa_page_pool);
 }
 
 static void __bounce_end_io_read(struct bio *bio, mempool_t *pool)
@@ -187,12 +187,12 @@ static void __bounce_end_io_read(struct bio *bio, mempool_t *pool)
 
 static void bounce_end_io_read(struct bio *bio)
 {
-	__bounce_end_io_read(bio, page_pool);
+	__bounce_end_io_read(bio, &page_pool);
 }
 
 static void bounce_end_io_read_isa(struct bio *bio)
 {
-	__bounce_end_io_read(bio, isa_page_pool);
+	__bounce_end_io_read(bio, &isa_page_pool);
 }
 
 static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
@@ -217,13 +217,13 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
 		return;
 
 	if (!passthrough && sectors < bio_sectors(*bio_orig)) {
-		bio = bio_split(*bio_orig, sectors, GFP_NOIO, bounce_bio_split);
+		bio = bio_split(*bio_orig, sectors, GFP_NOIO, &bounce_bio_split);
 		bio_chain(bio, *bio_orig);
 		generic_make_request(*bio_orig);
 		*bio_orig = bio;
 	}
 	bio = bio_clone_bioset(*bio_orig, GFP_NOIO, passthrough ? NULL :
-			bounce_bio_set);
+			&bounce_bio_set);
 
 	bio_for_each_segment_all(to, bio, i) {
 		struct page *page = to->bv_page;
@@ -250,7 +250,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
 
 	bio->bi_flags |= (1 << BIO_BOUNCED);
 
-	if (pool == page_pool) {
+	if (pool == &page_pool) {
 		bio->bi_end_io = bounce_end_io_write;
 		if (rw == READ)
 			bio->bi_end_io = bounce_end_io_read;
@@ -282,10 +282,10 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
 	if (!(q->bounce_gfp & GFP_DMA)) {
 		if (q->limits.bounce_pfn >= blk_max_pfn)
 			return;
-		pool = page_pool;
+		pool = &page_pool;
 	} else {
-		BUG_ON(!isa_page_pool);
-		pool = isa_page_pool;
+		BUG_ON(!mempool_initialized(&isa_page_pool));
+		pool = &isa_page_pool;
 	}
 
 	/*
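
bounce.c is one of the "convert users of bio_sets to embedded structs" conversions from this pull: the bio_sets and mempools become embedded objects initialised with bioset_init()/mempool_init() instead of pointers returned by *_create(), which removes an allocation and ties each pool's lifetime to its owner. A rough userspace analogue of that shape (the pool type and helpers here are invented for illustration):

    #include <stdbool.h>
    #include <stdlib.h>

    /* Toy pool that lives inside its user, mirroring the mempool_init()/
     * bioset_init() style: the caller provides the storage, init just fills it. */
    struct pool {
        void **slots;
        int nr;
    };

    static int pool_init(struct pool *p, int nr)
    {
        p->slots = calloc(nr, sizeof(*p->slots));
        if (!p->slots)
            return -1;
        p->nr = nr;
        return 0;
    }

    static bool pool_initialized(const struct pool *p)  /* cf. mempool_initialized() */
    {
        return p->slots != NULL;
    }

    static void pool_exit(struct pool *p)
    {
        free(p->slots);
        p->slots = NULL;
        p->nr = 0;
    }

    /* Embedded object, not a pointer handed back by a *_create() call. */
    static struct pool page_pool;

    int main(void)
    {
        if (pool_init(&page_pool, 64))
            return 1;
        if (pool_initialized(&page_pool))
            pool_exit(&page_pool);
        return 0;
    }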

+ 2 - 4
block/bsg-lib.c

@@ -303,11 +303,9 @@ static void bsg_exit_rq(struct request_queue *q, struct request *req)
  * @name: device to give bsg device
  * @job_fn: bsg job handler
  * @dd_job_size: size of LLD data needed for each job
- * @release: @dev release function
  */
 struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
-		bsg_job_fn *job_fn, int dd_job_size,
-		void (*release)(struct device *))
+		bsg_job_fn *job_fn, int dd_job_size)
 {
 	struct request_queue *q;
 	int ret;
@@ -331,7 +329,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
 	blk_queue_softirq_done(q, bsg_softirq_done);
 	blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT);
 
-	ret = bsg_register_queue(q, dev, name, &bsg_transport_ops, release);
+	ret = bsg_register_queue(q, dev, name, &bsg_transport_ops);
 	if (ret) {
 		printk(KERN_ERR "%s: bsg interface failed to "
 		       "initialize - register queue\n", dev->kobj.name);

+ 7 - 37
block/bsg.c

@@ -226,8 +226,7 @@ bsg_map_hdr(struct request_queue *q, struct sg_io_v4 *hdr, fmode_t mode)
 		return ERR_PTR(ret);
 
 	rq = blk_get_request(q, hdr->dout_xfer_len ?
-			REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN,
-			GFP_KERNEL);
+			REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
 	if (IS_ERR(rq))
 		return rq;
 
@@ -249,7 +248,7 @@ bsg_map_hdr(struct request_queue *q, struct sg_io_v4 *hdr, fmode_t mode)
 			goto out;
 		}
 
-		next_rq = blk_get_request(q, REQ_OP_SCSI_IN, GFP_KERNEL);
+		next_rq = blk_get_request(q, REQ_OP_SCSI_IN, 0);
 		if (IS_ERR(next_rq)) {
 			ret = PTR_ERR(next_rq);
 			goto out;
@@ -650,18 +649,6 @@ static struct bsg_device *bsg_alloc_device(void)
 	return bd;
 }
 
-static void bsg_kref_release_function(struct kref *kref)
-{
-	struct bsg_class_device *bcd =
-		container_of(kref, struct bsg_class_device, ref);
-	struct device *parent = bcd->parent;
-
-	if (bcd->release)
-		bcd->release(bcd->parent);
-
-	put_device(parent);
-}
-
 static int bsg_put_device(struct bsg_device *bd)
 {
 	int ret = 0, do_free;
@@ -694,7 +681,6 @@ static int bsg_put_device(struct bsg_device *bd)
 
 	kfree(bd);
 out:
-	kref_put(&q->bsg_dev.ref, bsg_kref_release_function);
 	if (do_free)
 		blk_put_queue(q);
 	return ret;
@@ -760,8 +746,6 @@ static struct bsg_device *bsg_get_device(struct inode *inode, struct file *file)
 	 */
 	mutex_lock(&bsg_mutex);
 	bcd = idr_find(&bsg_minor_idr, iminor(inode));
-	if (bcd)
-		kref_get(&bcd->ref);
 	mutex_unlock(&bsg_mutex);
 
 	if (!bcd)
@@ -772,8 +756,6 @@ static struct bsg_device *bsg_get_device(struct inode *inode, struct file *file)
 		return bd;
 
 	bd = bsg_add_device(inode, bcd->queue, file);
-	if (IS_ERR(bd))
-		kref_put(&bcd->ref, bsg_kref_release_function);
 
 	return bd;
 }
@@ -913,25 +895,17 @@ void bsg_unregister_queue(struct request_queue *q)
 		sysfs_remove_link(&q->kobj, "bsg");
 	device_unregister(bcd->class_dev);
 	bcd->class_dev = NULL;
-	kref_put(&bcd->ref, bsg_kref_release_function);
 	mutex_unlock(&bsg_mutex);
 }
 EXPORT_SYMBOL_GPL(bsg_unregister_queue);
 
 int bsg_register_queue(struct request_queue *q, struct device *parent,
-		const char *name, const struct bsg_ops *ops,
-		void (*release)(struct device *))
+		const char *name, const struct bsg_ops *ops)
 {
 	struct bsg_class_device *bcd;
 	dev_t dev;
 	int ret;
 	struct device *class_dev = NULL;
-	const char *devname;
-
-	if (name)
-		devname = name;
-	else
-		devname = dev_name(parent);
 
 	/*
 	 * we need a proper transport to send commands, not a stacked device
@@ -955,15 +929,12 @@ int bsg_register_queue(struct request_queue *q, struct device *parent,
 
 	bcd->minor = ret;
 	bcd->queue = q;
-	bcd->parent = get_device(parent);
-	bcd->release = release;
 	bcd->ops = ops;
-	kref_init(&bcd->ref);
 	dev = MKDEV(bsg_major, bcd->minor);
-	class_dev = device_create(bsg_class, parent, dev, NULL, "%s", devname);
+	class_dev = device_create(bsg_class, parent, dev, NULL, "%s", name);
 	if (IS_ERR(class_dev)) {
 		ret = PTR_ERR(class_dev);
-		goto put_dev;
+		goto idr_remove;
 	}
 	bcd->class_dev = class_dev;
 
@@ -978,8 +949,7 @@ int bsg_register_queue(struct request_queue *q, struct device *parent,
 
 unregister_class_dev:
 	device_unregister(class_dev);
-put_dev:
-	put_device(parent);
+idr_remove:
 	idr_remove(&bsg_minor_idr, bcd->minor);
 unlock:
 	mutex_unlock(&bsg_mutex);
@@ -993,7 +963,7 @@ int bsg_scsi_register_queue(struct request_queue *q, struct device *parent)
 		return -EINVAL;
 	}
 
-	return bsg_register_queue(q, parent, NULL, &bsg_scsi_ops, NULL);
+	return bsg_register_queue(q, parent, dev_name(parent), &bsg_scsi_ops);
 }
 EXPORT_SYMBOL_GPL(bsg_scsi_register_queue);
 

+ 30 - 36
block/cfq-iosched.c

@@ -210,9 +210,9 @@ struct cfqg_stats {
 	/* total time with empty current active q with other requests queued */
 	struct blkg_stat		empty_time;
 	/* fields after this shouldn't be cleared on stat reset */
-	uint64_t			start_group_wait_time;
-	uint64_t			start_idle_time;
-	uint64_t			start_empty_time;
+	u64				start_group_wait_time;
+	u64				start_idle_time;
+	u64				start_empty_time;
 	uint16_t			flags;
 #endif	/* CONFIG_DEBUG_BLK_CGROUP */
 #endif	/* CONFIG_CFQ_GROUP_IOSCHED */
@@ -491,13 +491,13 @@ CFQG_FLAG_FNS(empty)
 /* This should be called with the queue_lock held. */
 static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)
 {
-	unsigned long long now;
+	u64 now;
 
 	if (!cfqg_stats_waiting(stats))
 		return;
 
-	now = sched_clock();
-	if (time_after64(now, stats->start_group_wait_time))
+	now = ktime_get_ns();
+	if (now > stats->start_group_wait_time)
 		blkg_stat_add(&stats->group_wait_time,
 			      now - stats->start_group_wait_time);
 	cfqg_stats_clear_waiting(stats);
@@ -513,20 +513,20 @@ static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
 		return;
 	if (cfqg == curr_cfqg)
 		return;
-	stats->start_group_wait_time = sched_clock();
+	stats->start_group_wait_time = ktime_get_ns();
 	cfqg_stats_mark_waiting(stats);
 }
 
 /* This should be called with the queue_lock held. */
 static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)
 {
-	unsigned long long now;
+	u64 now;
 
 	if (!cfqg_stats_empty(stats))
 		return;
 
-	now = sched_clock();
-	if (time_after64(now, stats->start_empty_time))
+	now = ktime_get_ns();
+	if (now > stats->start_empty_time)
 		blkg_stat_add(&stats->empty_time,
 			      now - stats->start_empty_time);
 	cfqg_stats_clear_empty(stats);
@@ -552,7 +552,7 @@ static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
 	if (cfqg_stats_empty(stats))
 		return;
 
-	stats->start_empty_time = sched_clock();
+	stats->start_empty_time = ktime_get_ns();
 	cfqg_stats_mark_empty(stats);
 }
 
@@ -561,9 +561,9 @@ static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)
 	struct cfqg_stats *stats = &cfqg->stats;
 
 	if (cfqg_stats_idling(stats)) {
-		unsigned long long now = sched_clock();
+		u64 now = ktime_get_ns();
 
-		if (time_after64(now, stats->start_idle_time))
+		if (now > stats->start_idle_time)
 			blkg_stat_add(&stats->idle_time,
 				      now - stats->start_idle_time);
 		cfqg_stats_clear_idling(stats);
@@ -576,7 +576,7 @@ static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)
 
 	BUG_ON(cfqg_stats_idling(stats));
 
-	stats->start_idle_time = sched_clock();
+	stats->start_idle_time = ktime_get_ns();
 	cfqg_stats_mark_idling(stats);
 }
 
@@ -701,17 +701,19 @@ static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg,
 }
 
 static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
-			uint64_t start_time, uint64_t io_start_time,
-			unsigned int op)
+						u64 start_time_ns,
+						u64 io_start_time_ns,
+						unsigned int op)
 {
 	struct cfqg_stats *stats = &cfqg->stats;
-	unsigned long long now = sched_clock();
+	u64 now = ktime_get_ns();
 
-	if (time_after64(now, io_start_time))
-		blkg_rwstat_add(&stats->service_time, op, now - io_start_time);
-	if (time_after64(io_start_time, start_time))
+	if (now > io_start_time_ns)
+		blkg_rwstat_add(&stats->service_time, op,
+				now - io_start_time_ns);
+	if (io_start_time_ns > start_time_ns)
 		blkg_rwstat_add(&stats->wait_time, op,
-				io_start_time - start_time);
+				io_start_time_ns - start_time_ns);
 }
 
 /* @stats = 0 */
@@ -797,8 +799,9 @@ static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg,
 static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg,
 			unsigned int op) { }
 static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
-			uint64_t start_time, uint64_t io_start_time,
-			unsigned int op) { }
+						u64 start_time_ns,
+						u64 io_start_time_ns,
+						unsigned int op) { }
 
 #endif	/* CONFIG_CFQ_GROUP_IOSCHED */
 
@@ -4225,8 +4228,8 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 	cfqd->rq_in_driver--;
 	cfqq->dispatched--;
 	(RQ_CFQG(rq))->dispatched--;
-	cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq),
-				     rq_io_start_time_ns(rq), rq->cmd_flags);
+	cfqg_stats_update_completion(cfqq->cfqg, rq->start_time_ns,
+				     rq->io_start_time_ns, rq->cmd_flags);
 
 	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
 
@@ -4242,16 +4245,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 					cfqq_type(cfqq));
 
 		st->ttime.last_end_request = now;
-		/*
-		 * We have to do this check in jiffies since start_time is in
-		 * jiffies and it is not trivial to convert to ns. If
-		 * cfq_fifo_expire[1] ever comes close to 1 jiffie, this test
-		 * will become problematic but so far we are fine (the default
-		 * is 128 ms).
-		 */
-		if (!time_after(rq->start_time +
-				  nsecs_to_jiffies(cfqd->cfq_fifo_expire[1]),
-				jiffies))
+		if (rq->start_time_ns + cfqd->cfq_fifo_expire[1] <= now)
 			cfqd->last_delayed_sync = now;
 	}
 
@@ -4792,7 +4786,7 @@ USEC_STORE_FUNCTION(cfq_target_latency_us_store, &cfqd->cfq_target_latency, 1, U
 #undef USEC_STORE_FUNCTION
 
 #define CFQ_ATTR(name) \
-	__ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
+	__ATTR(name, 0644, cfq_##name##_show, cfq_##name##_store)
 
 static struct elv_fs_entry cfq_attrs[] = {
 	CFQ_ATTR(quantum),
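
The cfq-iosched.c changes belong to the series-wide switch from sched_clock()/jiffies bookkeeping to u64 nanosecond timestamps from ktime_get_ns(), after which "charge the elapsed time only if now is past the start stamp" is a plain unsigned comparison. A small userspace sketch of that pattern, using CLOCK_MONOTONIC as a stand-in clock:

    #define _POSIX_C_SOURCE 199309L
    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    /* Userspace stand-in for ktime_get_ns(): monotonic time in nanoseconds. */
    static uint64_t now_ns(void)
    {
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
    }

    /* Accumulate elapsed time the way cfqg_stats_update_completion() now does:
     * only add the delta when "now" is actually past the start stamp. */
    static void account(uint64_t *total_ns, uint64_t start_ns)
    {
        uint64_t now = now_ns();

        if (now > start_ns)
            *total_ns += now - start_ns;
    }

    int main(void)
    {
        uint64_t total = 0;
        uint64_t start = now_ns();

        account(&total, start);
        printf("accumulated %llu ns\n", (unsigned long long)total);
        return 0;
    }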

+ 1 - 2
block/deadline-iosched.c

@@ -512,8 +512,7 @@ STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
 #undef STORE_FUNCTION
 
 #define DD_ATTR(name) \
-	__ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show, \
-				      deadline_##name##_store)
+	__ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store)
 
 static struct elv_fs_entry deadline_attrs[] = {
 	DD_ATTR(read_expire),

+ 52 - 49
block/elevator.c

@@ -199,76 +199,46 @@ static void elevator_release(struct kobject *kobj)
 	kfree(e);
 }
 
-int elevator_init(struct request_queue *q, char *name)
+/*
+ * Use the default elevator specified by config boot param for non-mq devices,
+ * or by config option.  Don't try to load modules as we could be running off
+ * async and request_module() isn't allowed from async.
+ */
+int elevator_init(struct request_queue *q)
 {
 	struct elevator_type *e = NULL;
-	int err;
+	int err = 0;
 
 	/*
 	 * q->sysfs_lock must be held to provide mutual exclusion between
 	 * elevator_switch() and here.
 	 */
-	lockdep_assert_held(&q->sysfs_lock);
-
+	mutex_lock(&q->sysfs_lock);
 	if (unlikely(q->elevator))
-		return 0;
-
-	INIT_LIST_HEAD(&q->queue_head);
-	q->last_merge = NULL;
-	q->end_sector = 0;
-	q->boundary_rq = NULL;
-
-	if (name) {
-		e = elevator_get(q, name, true);
-		if (!e)
-			return -EINVAL;
-	}
+		goto out_unlock;
 
-	/*
-	 * Use the default elevator specified by config boot param for
-	 * non-mq devices, or by config option. Don't try to load modules
-	 * as we could be running off async and request_module() isn't
-	 * allowed from async.
-	 */
-	if (!e && !q->mq_ops && *chosen_elevator) {
+	if (*chosen_elevator) {
 		e = elevator_get(q, chosen_elevator, false);
 		if (!e)
 			printk(KERN_ERR "I/O scheduler %s not found\n",
 							chosen_elevator);
 	}
 
+	if (!e)
+		e = elevator_get(q, CONFIG_DEFAULT_IOSCHED, false);
 	if (!e) {
-		/*
-		 * For blk-mq devices, we default to using mq-deadline,
-		 * if available, for single queue devices. If deadline
-		 * isn't available OR we have multiple queues, default
-		 * to "none".
-		 */
-		if (q->mq_ops) {
-			if (q->nr_hw_queues == 1)
-				e = elevator_get(q, "mq-deadline", false);
-			if (!e)
-				return 0;
-		} else
-			e = elevator_get(q, CONFIG_DEFAULT_IOSCHED, false);
-
-		if (!e) {
-			printk(KERN_ERR
-				"Default I/O scheduler not found. " \
-				"Using noop.\n");
-			e = elevator_get(q, "noop", false);
-		}
+		printk(KERN_ERR
+			"Default I/O scheduler not found. Using noop.\n");
+		e = elevator_get(q, "noop", false);
 	}
 
-	if (e->uses_mq)
-		err = blk_mq_init_sched(q, e);
-	else
-		err = e->ops.sq.elevator_init_fn(q, e);
+	err = e->ops.sq.elevator_init_fn(q, e);
 	if (err)
 		elevator_put(e);
+out_unlock:
+	mutex_unlock(&q->sysfs_lock);
 	return err;
 }
-EXPORT_SYMBOL(elevator_init);
 
 void elevator_exit(struct request_queue *q, struct elevator_queue *e)
 {
@@ -281,7 +251,6 @@ void elevator_exit(struct request_queue *q, struct elevator_queue *e)
 
 	kobject_put(&e->kobj);
 }
-EXPORT_SYMBOL(elevator_exit);
 
 static inline void __elv_rqhash_del(struct request *rq)
 {
@@ -1004,6 +973,40 @@ out:
 	return ret;
 }
 
+/*
+ * For blk-mq devices, we default to using mq-deadline, if available, for single
+ * queue devices.  If deadline isn't available OR we have multiple queues,
+ * default to "none".
+ */
+int elevator_init_mq(struct request_queue *q)
+{
+	struct elevator_type *e;
+	int err = 0;
+
+	if (q->nr_hw_queues != 1)
+		return 0;
+
+	/*
+	 * q->sysfs_lock must be held to provide mutual exclusion between
+	 * elevator_switch() and here.
+	 */
+	mutex_lock(&q->sysfs_lock);
+	if (unlikely(q->elevator))
+		goto out_unlock;
+
+	e = elevator_get(q, "mq-deadline", false);
+	if (!e)
+		goto out_unlock;
+
+	err = blk_mq_init_sched(q, e);
+	if (err)
+		elevator_put(e);
+out_unlock:
+	mutex_unlock(&q->sysfs_lock);
+	return err;
+}
+
+
 /*
  * switch to new_e io scheduler. be careful not to introduce deadlocks -
  * we don't free the old io scheduler, before we have allocated what we
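
elevator.c splits initialisation into elevator_init() for the legacy request path and the new elevator_init_mq() for blk-mq, which only picks mq-deadline when the device has a single hardware queue and otherwise leaves the queue at "none". The tiny sketch below restates that decision as a standalone function; the names and the cfq fallback string are illustrative only:

    #include <stdio.h>

    /* Restates the default-scheduler policy after the elevator_init() /
     * elevator_init_mq() split. Strings are illustrative, not a kernel API. */
    static const char *pick_default_elevator(int is_mq, int nr_hw_queues,
                                             const char *chosen)
    {
        if (is_mq)  /* elevator_init_mq(): mq-deadline only for single-queue devices */
            return nr_hw_queues == 1 ? "mq-deadline" : "none";

        /* elevator_init(): boot-time "elevator=" choice first, then the
         * configured default, with noop as the last resort. */
        if (chosen && *chosen)
            return chosen;
        return "cfq";   /* stands in for CONFIG_DEFAULT_IOSCHED */
    }

    int main(void)
    {
        printf("%s\n", pick_default_elevator(1, 1, ""));            /* mq-deadline */
        printf("%s\n", pick_default_elevator(1, 4, ""));            /* none */
        printf("%s\n", pick_default_elevator(0, 1, "deadline"));    /* deadline */
        return 0;
    }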

+ 17 - 20
block/genhd.c

@@ -1139,28 +1139,25 @@ static ssize_t disk_discard_alignment_show(struct device *dev,
 	return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue));
 }
 
-static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
-static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
-static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
-static DEVICE_ATTR(hidden, S_IRUGO, disk_hidden_show, NULL);
-static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL);
-static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
-static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL);
-static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show,
-		   NULL);
-static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
-static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
-static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
-static DEVICE_ATTR(badblocks, S_IRUGO | S_IWUSR, disk_badblocks_show,
-		disk_badblocks_store);
+static DEVICE_ATTR(range, 0444, disk_range_show, NULL);
+static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL);
+static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL);
+static DEVICE_ATTR(hidden, 0444, disk_hidden_show, NULL);
+static DEVICE_ATTR(ro, 0444, disk_ro_show, NULL);
+static DEVICE_ATTR(size, 0444, part_size_show, NULL);
+static DEVICE_ATTR(alignment_offset, 0444, disk_alignment_offset_show, NULL);
+static DEVICE_ATTR(discard_alignment, 0444, disk_discard_alignment_show, NULL);
+static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL);
+static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
+static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
+static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 static struct device_attribute dev_attr_fail =
-	__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
+	__ATTR(make-it-fail, 0644, part_fail_show, part_fail_store);
 #endif
 #ifdef CONFIG_FAIL_IO_TIMEOUT
 static struct device_attribute dev_attr_fail_timeout =
-	__ATTR(io-timeout-fail,  S_IRUGO|S_IWUSR, part_timeout_show,
-		part_timeout_store);
+	__ATTR(io-timeout-fail, 0644, part_timeout_show, part_timeout_store);
 #endif
 
 static struct attribute *disk_attrs[] = {
@@ -1924,9 +1921,9 @@ static ssize_t disk_events_poll_msecs_store(struct device *dev,
 	return count;
 }
 
-static const DEVICE_ATTR(events, S_IRUGO, disk_events_show, NULL);
-static const DEVICE_ATTR(events_async, S_IRUGO, disk_events_async_show, NULL);
-static const DEVICE_ATTR(events_poll_msecs, S_IRUGO|S_IWUSR,
+static const DEVICE_ATTR(events, 0444, disk_events_show, NULL);
+static const DEVICE_ATTR(events_async, 0444, disk_events_async_show, NULL);
+static const DEVICE_ATTR(events_poll_msecs, 0644,
 			 disk_events_poll_msecs_show,
 			 disk_events_poll_msecs_store);
 

+ 164 - 35
block/kyber-iosched.c

@@ -72,6 +72,19 @@ static const unsigned int kyber_batch_size[] = {
 	[KYBER_OTHER] = 8,
 };
 
+/*
+ * There is a one-to-one mapping between ctx & hctx and between kcq & khd;
+ * we use request->mq_ctx->index_hw to index the kcq in khd.
+ */
+struct kyber_ctx_queue {
+	/*
+	 * Used to ensure that operations on rq_list and kcq_map are atomic.
+	 * Also protects the rqs on rq_list during merges.
+	 */
+	spinlock_t lock;
+	struct list_head rq_list[KYBER_NUM_DOMAINS];
+} ____cacheline_aligned_in_smp;
+
 struct kyber_queue_data {
 	struct request_queue *q;
 
@@ -99,6 +112,8 @@ struct kyber_hctx_data {
 	struct list_head rqs[KYBER_NUM_DOMAINS];
 	unsigned int cur_domain;
 	unsigned int batching;
+	struct kyber_ctx_queue *kcqs;
+	struct sbitmap kcq_map[KYBER_NUM_DOMAINS];
 	wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
 	struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
 	atomic_t wait_index[KYBER_NUM_DOMAINS];
@@ -107,10 +122,8 @@ struct kyber_hctx_data {
 static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
 			     void *key);
 
-static int rq_sched_domain(const struct request *rq)
+static unsigned int kyber_sched_domain(unsigned int op)
 {
-	unsigned int op = rq->cmd_flags;
-
 	if ((op & REQ_OP_MASK) == REQ_OP_READ)
 		return KYBER_READ;
 	else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
@@ -284,6 +297,11 @@ static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
 	return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
 }
 
+static int kyber_bucket_fn(const struct request *rq)
+{
+	return kyber_sched_domain(rq->cmd_flags);
+}
+
 static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
 {
 	struct kyber_queue_data *kqd;
@@ -297,7 +315,7 @@ static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
 		goto err;
 	kqd->q = q;
 
-	kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, rq_sched_domain,
+	kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, kyber_bucket_fn,
 					  KYBER_NUM_DOMAINS, kqd);
 	if (!kqd->cb)
 		goto err_kqd;
@@ -376,8 +394,18 @@ static void kyber_exit_sched(struct elevator_queue *e)
 	kfree(kqd);
 }
 
+static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq)
+{
+	unsigned int i;
+
+	spin_lock_init(&kcq->lock);
+	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
+		INIT_LIST_HEAD(&kcq->rq_list[i]);
+}
+
 static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
 {
+	struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
 	struct kyber_hctx_data *khd;
 	int i;
 
@@ -385,6 +413,24 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
 	if (!khd)
 		return -ENOMEM;
 
+	khd->kcqs = kmalloc_array_node(hctx->nr_ctx,
+				       sizeof(struct kyber_ctx_queue),
+				       GFP_KERNEL, hctx->numa_node);
+	if (!khd->kcqs)
+		goto err_khd;
+
+	for (i = 0; i < hctx->nr_ctx; i++)
+		kyber_ctx_queue_init(&khd->kcqs[i]);
+
+	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
+		if (sbitmap_init_node(&khd->kcq_map[i], hctx->nr_ctx,
+				      ilog2(8), GFP_KERNEL, hctx->numa_node)) {
+			while (--i >= 0)
+				sbitmap_free(&khd->kcq_map[i]);
+			goto err_kcqs;
+		}
+	}
+
 	spin_lock_init(&khd->lock);
 
 	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
@@ -400,12 +446,26 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
 	khd->batching = 0;
 
 	hctx->sched_data = khd;
+	sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags,
+					kqd->async_depth);
 
 	return 0;
+
+err_kcqs:
+	kfree(khd->kcqs);
+err_khd:
+	kfree(khd);
+	return -ENOMEM;
 }
 
 static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
 {
+	struct kyber_hctx_data *khd = hctx->sched_data;
+	int i;
+
+	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
+		sbitmap_free(&khd->kcq_map[i]);
+	kfree(khd->kcqs);
 	kfree(hctx->sched_data);
 }
 
@@ -427,7 +487,7 @@ static void rq_clear_domain_token(struct kyber_queue_data *kqd,
 
 	nr = rq_get_domain_token(rq);
 	if (nr != -1) {
-		sched_domain = rq_sched_domain(rq);
+		sched_domain = kyber_sched_domain(rq->cmd_flags);
 		sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
 				    rq->mq_ctx->cpu);
 	}
@@ -446,11 +506,51 @@ static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
 	}
 }
 
+static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
+{
+	struct kyber_hctx_data *khd = hctx->sched_data;
+	struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue);
+	struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw];
+	unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
+	struct list_head *rq_list = &kcq->rq_list[sched_domain];
+	bool merged;
+
+	spin_lock(&kcq->lock);
+	merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio);
+	spin_unlock(&kcq->lock);
+	blk_mq_put_ctx(ctx);
+
+	return merged;
+}
+
 static void kyber_prepare_request(struct request *rq, struct bio *bio)
 {
 	rq_set_domain_token(rq, -1);
 }
 
+static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
+				  struct list_head *rq_list, bool at_head)
+{
+	struct kyber_hctx_data *khd = hctx->sched_data;
+	struct request *rq, *next;
+
+	list_for_each_entry_safe(rq, next, rq_list, queuelist) {
+		unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
+		struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw];
+		struct list_head *head = &kcq->rq_list[sched_domain];
+
+		spin_lock(&kcq->lock);
+		if (at_head)
+			list_move(&rq->queuelist, head);
+		else
+			list_move_tail(&rq->queuelist, head);
+		sbitmap_set_bit(&khd->kcq_map[sched_domain],
+				rq->mq_ctx->index_hw);
+		blk_mq_sched_request_inserted(rq);
+		spin_unlock(&kcq->lock);
+	}
+}
+
 static void kyber_finish_request(struct request *rq)
 {
 	struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
@@ -469,7 +569,7 @@ static void kyber_completed_request(struct request *rq)
 	 * Check if this request met our latency goal. If not, quickly gather
 	 * some statistics and start throttling.
 	 */
-	sched_domain = rq_sched_domain(rq);
+	sched_domain = kyber_sched_domain(rq->cmd_flags);
 	switch (sched_domain) {
 	case KYBER_READ:
 		target = kqd->read_lat_nsec;
@@ -485,29 +585,48 @@ static void kyber_completed_request(struct request *rq)
 	if (blk_stat_is_active(kqd->cb))
 		return;
 
-	now = __blk_stat_time(ktime_to_ns(ktime_get()));
-	if (now < blk_stat_time(&rq->issue_stat))
+	now = ktime_get_ns();
+	if (now < rq->io_start_time_ns)
 		return;
 
-	latency = now - blk_stat_time(&rq->issue_stat);
+	latency = now - rq->io_start_time_ns;
 
 	if (latency > target)
 		blk_stat_activate_msecs(kqd->cb, 10);
 }
 
-static void kyber_flush_busy_ctxs(struct kyber_hctx_data *khd,
-				  struct blk_mq_hw_ctx *hctx)
+struct flush_kcq_data {
+	struct kyber_hctx_data *khd;
+	unsigned int sched_domain;
+	struct list_head *list;
+};
+
+static bool flush_busy_kcq(struct sbitmap *sb, unsigned int bitnr, void *data)
 {
-	LIST_HEAD(rq_list);
-	struct request *rq, *next;
+	struct flush_kcq_data *flush_data = data;
+	struct kyber_ctx_queue *kcq = &flush_data->khd->kcqs[bitnr];
 
-	blk_mq_flush_busy_ctxs(hctx, &rq_list);
-	list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
-		unsigned int sched_domain;
+	spin_lock(&kcq->lock);
+	list_splice_tail_init(&kcq->rq_list[flush_data->sched_domain],
+			      flush_data->list);
+	sbitmap_clear_bit(sb, bitnr);
+	spin_unlock(&kcq->lock);
 
-		sched_domain = rq_sched_domain(rq);
-		list_move_tail(&rq->queuelist, &khd->rqs[sched_domain]);
-	}
+	return true;
+}
+
+static void kyber_flush_busy_kcqs(struct kyber_hctx_data *khd,
+				  unsigned int sched_domain,
+				  struct list_head *list)
+{
+	struct flush_kcq_data data = {
+		.khd = khd,
+		.sched_domain = sched_domain,
+		.list = list,
+	};
+
+	sbitmap_for_each_set(&khd->kcq_map[sched_domain],
+			     flush_busy_kcq, &data);
 }
 
 static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
@@ -570,26 +689,23 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd,
 static struct request *
 kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
 			  struct kyber_hctx_data *khd,
-			  struct blk_mq_hw_ctx *hctx,
-			  bool *flushed)
+			  struct blk_mq_hw_ctx *hctx)
 {
 	struct list_head *rqs;
 	struct request *rq;
 	int nr;
 
 	rqs = &khd->rqs[khd->cur_domain];
-	rq = list_first_entry_or_null(rqs, struct request, queuelist);
 
 	/*
-	 * If there wasn't already a pending request and we haven't flushed the
-	 * software queues yet, flush the software queues and check again.
+	 * If we already have a flushed request, then we just need to get a
+	 * token for it. Otherwise, if there are pending requests in the kcqs,
+	 * flush the kcqs, but only if we can get a token. If not, we should
+	 * leave the requests in the kcqs so that they can be merged. Note that
+	 * khd->lock serializes the flushes, so if we observed any bit set in
+	 * the kcq_map, we will always get a request.
 	 */
-	if (!rq && !*flushed) {
-		kyber_flush_busy_ctxs(khd, hctx);
-		*flushed = true;
-		rq = list_first_entry_or_null(rqs, struct request, queuelist);
-	}
-
+	rq = list_first_entry_or_null(rqs, struct request, queuelist);
 	if (rq) {
 		nr = kyber_get_domain_token(kqd, khd, hctx);
 		if (nr >= 0) {
@@ -598,6 +714,16 @@ kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
 			list_del_init(&rq->queuelist);
 			return rq;
 		}
+	} else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) {
+		nr = kyber_get_domain_token(kqd, khd, hctx);
+		if (nr >= 0) {
+			kyber_flush_busy_kcqs(khd, khd->cur_domain, rqs);
+			rq = list_first_entry(rqs, struct request, queuelist);
+			khd->batching++;
+			rq_set_domain_token(rq, nr);
+			list_del_init(&rq->queuelist);
+			return rq;
+		}
 	}
 
 	/* There were either no pending requests or no tokens. */
@@ -608,7 +734,6 @@ static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
 {
 	struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
 	struct kyber_hctx_data *khd = hctx->sched_data;
-	bool flushed = false;
 	struct request *rq;
 	int i;
 
@@ -619,7 +744,7 @@ static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
 	 * from the batch.
 	 */
 	if (khd->batching < kyber_batch_size[khd->cur_domain]) {
-		rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
+		rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
 		if (rq)
 			goto out;
 	}
@@ -640,7 +765,7 @@ static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
 		else
 			khd->cur_domain++;
 
-		rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
+		rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
 		if (rq)
 			goto out;
 	}
@@ -657,10 +782,12 @@ static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
 	int i;
 
 	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
-		if (!list_empty_careful(&khd->rqs[i]))
+		if (!list_empty_careful(&khd->rqs[i]) ||
+		    sbitmap_any_bit_set(&khd->kcq_map[i]))
 			return true;
 	}
-	return sbitmap_any_bit_set(&hctx->ctx_map);
+
+	return false;
 }
 
 #define KYBER_LAT_SHOW_STORE(op)					\
@@ -831,7 +958,9 @@ static struct elevator_type kyber_sched = {
 		.init_hctx = kyber_init_hctx,
 		.exit_hctx = kyber_exit_hctx,
 		.limit_depth = kyber_limit_depth,
+		.bio_merge = kyber_bio_merge,
 		.prepare_request = kyber_prepare_request,
+		.insert_requests = kyber_insert_requests,
 		.finish_request = kyber_finish_request,
 		.requeue_request = kyber_finish_request,
 		.completed_request = kyber_completed_request,
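
The Kyber rework keeps per-software-context request lists (khd->kcqs) inside the scheduler and marks the non-empty ones in a per-domain sbitmap (kcq_map), so bio merging can look at exactly one context's list and dispatch only walks contexts that actually have work. The self-contained sketch below mimics the "set a bit on insert, visit only set bits on flush" idea with a plain 64-bit word in place of an sbitmap; the names are made up for the example.

    #include <stdint.h>
    #include <stdio.h>

    #define NR_CTX 8

    /* One pending counter per software context; a real implementation would keep
     * a request list here, protected by a per-context lock. */
    static unsigned int pending[NR_CTX];
    static uint64_t busy_map;           /* bit i set => pending[i] != 0 */

    static void insert(int ctx)
    {
        pending[ctx]++;
        busy_map |= UINT64_C(1) << ctx; /* like sbitmap_set_bit(&kcq_map, ctx) */
    }

    /* Like kyber_flush_busy_kcqs(): visit only contexts whose bit is set,
     * pull their work out and clear the bit. */
    static unsigned int flush_busy(void)
    {
        unsigned int total = 0;

        while (busy_map) {
            int ctx = __builtin_ctzll(busy_map);

            total += pending[ctx];
            pending[ctx] = 0;
            busy_map &= ~(UINT64_C(1) << ctx);  /* like sbitmap_clear_bit() */
        }
        return total;
    }

    int main(void)
    {
        insert(1);
        insert(1);
        insert(5);
        printf("flushed %u requests\n", flush_busy());
        return 0;
    }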

+ 1 - 2
block/mq-deadline.c

@@ -630,8 +630,7 @@ STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
 #undef STORE_FUNCTION
 
 #define DD_ATTR(name) \
-	__ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show, \
-				      deadline_##name##_store)
+	__ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store)
 
 static struct elv_fs_entry deadline_attrs[] = {
 	DD_ATTR(read_expire),

+ 12 - 14
block/partition-generic.c

@@ -179,18 +179,17 @@ ssize_t part_fail_store(struct device *dev,
 }
 #endif
 
-static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
-static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
-static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
-static DEVICE_ATTR(ro, S_IRUGO, part_ro_show, NULL);
-static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
-static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show,
-		   NULL);
-static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
-static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
+static DEVICE_ATTR(partition, 0444, part_partition_show, NULL);
+static DEVICE_ATTR(start, 0444, part_start_show, NULL);
+static DEVICE_ATTR(size, 0444, part_size_show, NULL);
+static DEVICE_ATTR(ro, 0444, part_ro_show, NULL);
+static DEVICE_ATTR(alignment_offset, 0444, part_alignment_offset_show, NULL);
+static DEVICE_ATTR(discard_alignment, 0444, part_discard_alignment_show, NULL);
+static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
+static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 static struct device_attribute dev_attr_fail =
-	__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
+	__ATTR(make-it-fail, 0644, part_fail_show, part_fail_store);
 #endif
 
 static struct attribute *part_attrs[] = {
@@ -291,8 +290,7 @@ static ssize_t whole_disk_show(struct device *dev,
 {
 	return 0;
 }
-static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
-		   whole_disk_show, NULL);
+static DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL);
 
 /*
  * Must be called either with bd_mutex held, before a disk can be opened or
@@ -518,7 +516,7 @@ rescan:
 
 	if (disk->fops->revalidate_disk)
 		disk->fops->revalidate_disk(disk);
-	check_disk_size_change(disk, bdev);
+	check_disk_size_change(disk, bdev, true);
 	bdev->bd_invalidated = 0;
 	if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
 		return 0;
@@ -643,7 +641,7 @@ int invalidate_partitions(struct gendisk *disk, struct block_device *bdev)
 		return res;
 
 	set_capacity(disk, 0);
-	check_disk_size_change(disk, bdev);
+	check_disk_size_change(disk, bdev, false);
 	bdev->bd_invalidated = 0;
 	/* tell userspace that the media / partition table may have changed */
 	kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);

+ 4 - 6
block/scsi_ioctl.c

@@ -321,8 +321,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 		at_head = 1;
 
 	ret = -ENOMEM;
-	rq = blk_get_request(q, writing ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN,
-			GFP_KERNEL);
+	rq = blk_get_request(q, writing ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
 	if (IS_ERR(rq))
 		return PTR_ERR(rq);
 	req = scsi_req(rq);
@@ -449,8 +448,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
 
 	}
 
-	rq = blk_get_request(q, in_len ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN,
-			__GFP_RECLAIM);
+	rq = blk_get_request(q, in_len ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
 	if (IS_ERR(rq)) {
 		err = PTR_ERR(rq);
 		goto error_free_buffer;
@@ -501,7 +499,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
 		break;
 	}
 
-	if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_RECLAIM)) {
+	if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, GFP_NOIO)) {
 		err = DRIVER_ERROR << 24;
 		goto error;
 	}
@@ -538,7 +536,7 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk,
 	struct request *rq;
 	int err;
 
-	rq = blk_get_request(q, REQ_OP_SCSI_OUT, __GFP_RECLAIM);
+	rq = blk_get_request(q, REQ_OP_SCSI_OUT, 0);
 	if (IS_ERR(rq))
 		return PTR_ERR(rq);
 	rq->timeout = BLK_DEFAULT_SG_TIMEOUT;

+ 0 - 51
drivers/ata/libata-eh.c

@@ -500,57 +500,6 @@ void ata_eh_release(struct ata_port *ap)
 	mutex_unlock(&ap->host->eh_mutex);
 }
 
-/**
- *	ata_scsi_timed_out - SCSI layer time out callback
- *	@cmd: timed out SCSI command
- *
- *	Handles SCSI layer timeout.  We race with normal completion of
- *	the qc for @cmd.  If the qc is already gone, we lose and let
- *	the scsi command finish (EH_HANDLED).  Otherwise, the qc has
- *	timed out and EH should be invoked.  Prevent ata_qc_complete()
- *	from finishing it by setting EH_SCHEDULED and return
- *	EH_NOT_HANDLED.
- *
- *	TODO: kill this function once old EH is gone.
- *
- *	LOCKING:
- *	Called from timer context
- *
- *	RETURNS:
- *	EH_HANDLED or EH_NOT_HANDLED
- */
-enum blk_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd)
-{
-	struct Scsi_Host *host = cmd->device->host;
-	struct ata_port *ap = ata_shost_to_port(host);
-	unsigned long flags;
-	struct ata_queued_cmd *qc;
-	enum blk_eh_timer_return ret;
-
-	DPRINTK("ENTER\n");
-
-	if (ap->ops->error_handler) {
-		ret = BLK_EH_NOT_HANDLED;
-		goto out;
-	}
-
-	ret = BLK_EH_HANDLED;
-	spin_lock_irqsave(ap->lock, flags);
-	qc = ata_qc_from_tag(ap, ap->link.active_tag);
-	if (qc) {
-		WARN_ON(qc->scsicmd != cmd);
-		qc->flags |= ATA_QCFLAG_EH_SCHEDULED;
-		qc->err_mask |= AC_ERR_TIMEOUT;
-		ret = BLK_EH_NOT_HANDLED;
-	}
-	spin_unlock_irqrestore(ap->lock, flags);
-
- out:
-	DPRINTK("EXIT, ret=%d\n", ret);
-	return ret;
-}
-EXPORT_SYMBOL(ata_scsi_timed_out);
-
 static void ata_eh_unload(struct ata_port *ap)
 {
 	struct ata_link *link;

+ 3 - 8
drivers/block/DAC960.c

@@ -1179,7 +1179,6 @@ static bool DAC960_V1_EnableMemoryMailboxInterface(DAC960_Controller_T
 
   if (pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(32)))
 	return DAC960_Failure(Controller, "DMA mask out of range");
-  Controller->BounceBufferLimit = DMA_BIT_MASK(32);
 
   if ((hw_type == DAC960_PD_Controller) || (hw_type == DAC960_P_Controller)) {
     CommandMailboxesSize =  0;
@@ -1380,11 +1379,8 @@ static bool DAC960_V2_EnableMemoryMailboxInterface(DAC960_Controller_T
   dma_addr_t	CommandMailboxDMA;
   DAC960_V2_CommandStatus_T CommandStatus;
 
-	if (!pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(64)))
-		Controller->BounceBufferLimit = DMA_BIT_MASK(64);
-	else if (!pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(32)))
-		Controller->BounceBufferLimit = DMA_BIT_MASK(32);
-	else
+	if (pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(64)) &&
+	    pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(32)))
 		return DAC960_Failure(Controller, "DMA mask out of range");
 
   /* This is a temporary dma mapping, used only in the scope of this function */
@@ -2540,7 +2536,6 @@ static bool DAC960_RegisterBlockDevice(DAC960_Controller_T *Controller)
 		continue;
   	}
   	Controller->RequestQueue[n] = RequestQueue;
-  	blk_queue_bounce_limit(RequestQueue, Controller->BounceBufferLimit);
   	RequestQueue->queuedata = Controller;
 	blk_queue_max_segments(RequestQueue, Controller->DriverScatterGatherLimit);
 	blk_queue_max_hw_sectors(RequestQueue, Controller->MaxBlocksPerCommand);
@@ -6594,7 +6589,7 @@ static void DAC960_CreateProcEntries(DAC960_Controller_T *Controller)
 					 DAC960_ProcDirectoryEntry);
 	proc_create_data("initial_status", 0, ControllerProcEntry, &dac960_initial_status_proc_fops, Controller);
 	proc_create_data("current_status", 0, ControllerProcEntry, &dac960_current_status_proc_fops, Controller);
-	proc_create_data("user_command", S_IWUSR | S_IRUSR, ControllerProcEntry, &dac960_user_command_proc_fops, Controller);
+	proc_create_data("user_command", 0600, ControllerProcEntry, &dac960_user_command_proc_fops, Controller);
 	Controller->ControllerProcEntry = ControllerProcEntry;
 }
 

+ 0 - 1
drivers/block/DAC960.h

@@ -2295,7 +2295,6 @@ typedef struct DAC960_Controller
   unsigned short MaxBlocksPerCommand;
   unsigned short ControllerScatterGatherLimit;
   unsigned short DriverScatterGatherLimit;
-  u64		BounceBufferLimit;
   unsigned int CombinedStatusBufferLength;
   unsigned int InitialStatusLength;
   unsigned int CurrentStatusLength;

+ 5 - 6
drivers/block/aoe/aoeblk.c

@@ -159,14 +159,14 @@ static int aoe_debugfs_open(struct inode *inode, struct file *file)
 	return single_open(file, aoedisk_debugfs_show, inode->i_private);
 }
 
-static DEVICE_ATTR(state, S_IRUGO, aoedisk_show_state, NULL);
-static DEVICE_ATTR(mac, S_IRUGO, aoedisk_show_mac, NULL);
-static DEVICE_ATTR(netif, S_IRUGO, aoedisk_show_netif, NULL);
+static DEVICE_ATTR(state, 0444, aoedisk_show_state, NULL);
+static DEVICE_ATTR(mac, 0444, aoedisk_show_mac, NULL);
+static DEVICE_ATTR(netif, 0444, aoedisk_show_netif, NULL);
 static struct device_attribute dev_attr_firmware_version = {
-	.attr = { .name = "firmware-version", .mode = S_IRUGO },
+	.attr = { .name = "firmware-version", .mode = 0444 },
 	.show = aoedisk_show_fwver,
 };
-static DEVICE_ATTR(payload, S_IRUGO, aoedisk_show_payload, NULL);
+static DEVICE_ATTR(payload, 0444, aoedisk_show_payload, NULL);
 
 static struct attribute *aoe_attrs[] = {
 	&dev_attr_state.attr,
@@ -388,7 +388,6 @@ aoeblk_gdalloc(void *vp)
 			d->aoemajor, d->aoeminor);
 		goto err_mempool;
 	}
-	blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
 
 	spin_lock_irqsave(&d->lock, flags);
 	WARN_ON(!(d->flags & DEVFL_GD_NOW));

+ 2 - 1
drivers/block/aoe/aoecmd.c

@@ -1032,8 +1032,9 @@ bvcpy(struct sk_buff *skb, struct bio *bio, struct bvec_iter iter, long cnt)
 	iter.bi_size = cnt;
 
 	__bio_for_each_segment(bv, bio, iter, iter) {
-		char *p = page_address(bv.bv_page) + bv.bv_offset;
+		char *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
 		skb_copy_bits(skb, soff, p, bv.bv_len);
+		kunmap_atomic(p);
 		soff += bv.bv_len;
 	}
 }

+ 7 - 3
drivers/block/brd.c

@@ -331,15 +331,15 @@ static const struct block_device_operations brd_fops = {
  * And now the modules code and kernel interface.
  */
 static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT;
-module_param(rd_nr, int, S_IRUGO);
+module_param(rd_nr, int, 0444);
 MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");
 
 unsigned long rd_size = CONFIG_BLK_DEV_RAM_SIZE;
-module_param(rd_size, ulong, S_IRUGO);
+module_param(rd_size, ulong, 0444);
 MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
 
 static int max_part = 1;
-module_param(max_part, int, S_IRUGO);
+module_param(max_part, int, 0444);
 MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices");
 
 MODULE_LICENSE("GPL");
@@ -402,6 +402,10 @@ static struct brd_device *brd_alloc(int i)
 	set_capacity(disk, rd_size * 2);
 	disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO;
 
+	/* Tell the block layer that this is not a rotational device */
+	blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
+	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
+
 	return brd;
 
 out_free_queue:

+ 3 - 2
drivers/block/drbd/drbd_bitmap.c

@@ -977,7 +977,7 @@ static void drbd_bm_endio(struct bio *bio)
 	bm_page_unlock_io(device, idx);
 
 	if (ctx->flags & BM_AIO_COPY_PAGES)
-		mempool_free(bio->bi_io_vec[0].bv_page, drbd_md_io_page_pool);
+		mempool_free(bio->bi_io_vec[0].bv_page, &drbd_md_io_page_pool);
 
 	bio_put(bio);
 
@@ -1014,7 +1014,8 @@ static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_ho
 	bm_set_page_unchanged(b->bm_pages[page_nr]);
 
 	if (ctx->flags & BM_AIO_COPY_PAGES) {
-		page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_RECLAIM);
+		page = mempool_alloc(&drbd_md_io_page_pool,
+				GFP_NOIO | __GFP_HIGHMEM);
 		copy_highpage(page, b->bm_pages[page_nr]);
 		bm_store_page_idx(page, page_nr);
 	} else
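
Two things change in bm_page_io_async(): the mempool is now embedded and passed by address, and the allocation mask is spelled GFP_NOIO | __GFP_HIGHMEM instead of __GFP_RECLAIM | __GFP_HIGHMEM, making it explicit that this path must not recurse into the I/O stack. A minimal sketch of the allocation pattern, assuming pool was set up with mempool_init_page_pool():

    #include <linux/mempool.h>
    #include <linux/highmem.h>

    static void copy_page_via_pool(mempool_t *pool, struct page *src)
    {
        /* GFP_NOIO still allows direct reclaim, so mempool_alloc() will
         * not fail here; __GFP_HIGHMEM lets it hand back a highmem page. */
        struct page *page = mempool_alloc(pool, GFP_NOIO | __GFP_HIGHMEM);

        copy_highpage(page, src);
        /* ... hand the copy to the I/O machinery in real code ... */
        mempool_free(page, pool);
    }
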

+ 10 - 10
drivers/block/drbd/drbd_debugfs.c

@@ -481,9 +481,9 @@ void drbd_debugfs_resource_add(struct drbd_resource *resource)
 		goto fail;
 	resource->debugfs_res_connections = dentry;
 
-	dentry = debugfs_create_file("in_flight_summary", S_IRUSR|S_IRGRP,
-			resource->debugfs_res, resource,
-			&in_flight_summary_fops);
+	dentry = debugfs_create_file("in_flight_summary", 0440,
+				     resource->debugfs_res, resource,
+				     &in_flight_summary_fops);
 	if (IS_ERR_OR_NULL(dentry))
 		goto fail;
 	resource->debugfs_res_in_flight_summary = dentry;
@@ -645,16 +645,16 @@ void drbd_debugfs_connection_add(struct drbd_connection *connection)
 		goto fail;
 	connection->debugfs_conn = dentry;
 
-	dentry = debugfs_create_file("callback_history", S_IRUSR|S_IRGRP,
-			connection->debugfs_conn, connection,
-			&connection_callback_history_fops);
+	dentry = debugfs_create_file("callback_history", 0440,
+				     connection->debugfs_conn, connection,
+				     &connection_callback_history_fops);
 	if (IS_ERR_OR_NULL(dentry))
 		goto fail;
 	connection->debugfs_conn_callback_history = dentry;
 
-	dentry = debugfs_create_file("oldest_requests", S_IRUSR|S_IRGRP,
-			connection->debugfs_conn, connection,
-			&connection_oldest_requests_fops);
+	dentry = debugfs_create_file("oldest_requests", 0440,
+				     connection->debugfs_conn, connection,
+				     &connection_oldest_requests_fops);
 	if (IS_ERR_OR_NULL(dentry))
 		goto fail;
 	connection->debugfs_conn_oldest_requests = dentry;
@@ -824,7 +824,7 @@ void drbd_debugfs_device_add(struct drbd_device *device)
 	device->debugfs_minor = dentry;
 
 #define DCF(name)	do {					\
-	dentry = debugfs_create_file(#name, S_IRUSR|S_IRGRP,	\
+	dentry = debugfs_create_file(#name, 0440,	\
 			device->debugfs_vol, device,		\
 			&device_ ## name ## _fops);		\
 	if (IS_ERR_OR_NULL(dentry))				\

+ 5 - 5
drivers/block/drbd/drbd_int.h

@@ -1405,8 +1405,8 @@ extern struct kmem_cache *drbd_request_cache;
 extern struct kmem_cache *drbd_ee_cache;	/* peer requests */
 extern struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
 extern struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
-extern mempool_t *drbd_request_mempool;
-extern mempool_t *drbd_ee_mempool;
+extern mempool_t drbd_request_mempool;
+extern mempool_t drbd_ee_mempool;
 
 /* drbd's page pool, used to buffer data received from the peer,
  * or data requested by the peer.
@@ -1432,16 +1432,16 @@ extern wait_queue_head_t drbd_pp_wait;
  * 128 should be plenty, currently we probably can get away with as few as 1.
  */
 #define DRBD_MIN_POOL_PAGES	128
-extern mempool_t *drbd_md_io_page_pool;
+extern mempool_t drbd_md_io_page_pool;
 
 /* We also need to make sure we get a bio
  * when we need it for housekeeping purposes */
-extern struct bio_set *drbd_md_io_bio_set;
+extern struct bio_set drbd_md_io_bio_set;
 /* to allocate from that set */
 extern struct bio *bio_alloc_drbd(gfp_t gfp_mask);
 
 /* And a bio_set for cloning */
-extern struct bio_set *drbd_io_bio_set;
+extern struct bio_set drbd_io_bio_set;
 
 extern struct mutex resources_mutex;
 

+ 26 - 47
drivers/block/drbd/drbd_main.c

@@ -124,11 +124,11 @@ struct kmem_cache *drbd_request_cache;
 struct kmem_cache *drbd_ee_cache;	/* peer requests */
 struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
 struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
-mempool_t *drbd_request_mempool;
-mempool_t *drbd_ee_mempool;
-mempool_t *drbd_md_io_page_pool;
-struct bio_set *drbd_md_io_bio_set;
-struct bio_set *drbd_io_bio_set;
+mempool_t drbd_request_mempool;
+mempool_t drbd_ee_mempool;
+mempool_t drbd_md_io_page_pool;
+struct bio_set drbd_md_io_bio_set;
+struct bio_set drbd_io_bio_set;
 
 /* I do not use a standard mempool, because:
    1) I want to hand out the pre-allocated objects first.
@@ -153,10 +153,10 @@ struct bio *bio_alloc_drbd(gfp_t gfp_mask)
 {
 	struct bio *bio;
 
-	if (!drbd_md_io_bio_set)
+	if (!bioset_initialized(&drbd_md_io_bio_set))
 		return bio_alloc(gfp_mask, 1);
 
-	bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
+	bio = bio_alloc_bioset(gfp_mask, 1, &drbd_md_io_bio_set);
 	if (!bio)
 		return NULL;
 	return bio;
@@ -2097,16 +2097,11 @@ static void drbd_destroy_mempools(void)
 
 	/* D_ASSERT(device, atomic_read(&drbd_pp_vacant)==0); */
 
-	if (drbd_io_bio_set)
-		bioset_free(drbd_io_bio_set);
-	if (drbd_md_io_bio_set)
-		bioset_free(drbd_md_io_bio_set);
-	if (drbd_md_io_page_pool)
-		mempool_destroy(drbd_md_io_page_pool);
-	if (drbd_ee_mempool)
-		mempool_destroy(drbd_ee_mempool);
-	if (drbd_request_mempool)
-		mempool_destroy(drbd_request_mempool);
+	bioset_exit(&drbd_io_bio_set);
+	bioset_exit(&drbd_md_io_bio_set);
+	mempool_exit(&drbd_md_io_page_pool);
+	mempool_exit(&drbd_ee_mempool);
+	mempool_exit(&drbd_request_mempool);
 	if (drbd_ee_cache)
 		kmem_cache_destroy(drbd_ee_cache);
 	if (drbd_request_cache)
@@ -2116,11 +2111,6 @@ static void drbd_destroy_mempools(void)
 	if (drbd_al_ext_cache)
 		kmem_cache_destroy(drbd_al_ext_cache);
 
-	drbd_io_bio_set      = NULL;
-	drbd_md_io_bio_set   = NULL;
-	drbd_md_io_page_pool = NULL;
-	drbd_ee_mempool      = NULL;
-	drbd_request_mempool = NULL;
 	drbd_ee_cache        = NULL;
 	drbd_request_cache   = NULL;
 	drbd_bm_ext_cache    = NULL;
@@ -2133,18 +2123,7 @@ static int drbd_create_mempools(void)
 {
 	struct page *page;
 	const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * drbd_minor_count;
-	int i;
-
-	/* prepare our caches and mempools */
-	drbd_request_mempool = NULL;
-	drbd_ee_cache        = NULL;
-	drbd_request_cache   = NULL;
-	drbd_bm_ext_cache    = NULL;
-	drbd_al_ext_cache    = NULL;
-	drbd_pp_pool         = NULL;
-	drbd_md_io_page_pool = NULL;
-	drbd_md_io_bio_set   = NULL;
-	drbd_io_bio_set      = NULL;
+	int i, ret;
 
 	/* caches */
 	drbd_request_cache = kmem_cache_create(
@@ -2168,26 +2147,26 @@ static int drbd_create_mempools(void)
 		goto Enomem;
 
 	/* mempools */
-	drbd_io_bio_set = bioset_create(BIO_POOL_SIZE, 0, 0);
-	if (drbd_io_bio_set == NULL)
+	ret = bioset_init(&drbd_io_bio_set, BIO_POOL_SIZE, 0, 0);
+	if (ret)
 		goto Enomem;
 
-	drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0,
-					   BIOSET_NEED_BVECS);
-	if (drbd_md_io_bio_set == NULL)
+	ret = bioset_init(&drbd_md_io_bio_set, DRBD_MIN_POOL_PAGES, 0,
+			  BIOSET_NEED_BVECS);
+	if (ret)
 		goto Enomem;
 
-	drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0);
-	if (drbd_md_io_page_pool == NULL)
+	ret = mempool_init_page_pool(&drbd_md_io_page_pool, DRBD_MIN_POOL_PAGES, 0);
+	if (ret)
 		goto Enomem;
 
-	drbd_request_mempool = mempool_create_slab_pool(number,
-		drbd_request_cache);
-	if (drbd_request_mempool == NULL)
+	ret = mempool_init_slab_pool(&drbd_request_mempool, number,
+				     drbd_request_cache);
+	if (ret)
 		goto Enomem;
 
-	drbd_ee_mempool = mempool_create_slab_pool(number, drbd_ee_cache);
-	if (drbd_ee_mempool == NULL)
+	ret = mempool_init_slab_pool(&drbd_ee_mempool, number, drbd_ee_cache);
+	if (ret)
 		goto Enomem;
 
 	/* drbd's page pool */
@@ -3010,7 +2989,7 @@ static int __init drbd_init(void)
 		goto fail;
 
 	err = -ENOMEM;
-	drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
+	drbd_proc = proc_create_data("drbd", S_IFREG | 0444 , NULL, &drbd_proc_fops, NULL);
 	if (!drbd_proc)	{
 		pr_err("unable to register proc file\n");
 		goto fail;
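
The drbd_main.c conversion is the template for the embedded bio_set/mempool work in this series: the objects live inside the driver (or its per-device structure) instead of being allocated separately, bioset_init()/mempool_init_*() return an errno, and bioset_exit()/mempool_exit() are safe on objects that were never initialized, which is why the NULL checks and pointer resets disappear. A minimal sketch, with hypothetical names rather than drbd's:

    #include <linux/bio.h>
    #include <linux/mempool.h>

    static struct bio_set my_bio_set;
    static mempool_t my_page_pool;

    static int my_create_pools(void)
    {
        int ret;

        ret = bioset_init(&my_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
        if (ret)
            return ret;

        ret = mempool_init_page_pool(&my_page_pool, 128, 0);
        if (ret) {
            bioset_exit(&my_bio_set);
            return ret;
        }
        return 0;
    }

    static void my_destroy_pools(void)
    {
        /* both are no-ops on never-initialized objects, so one teardown
         * path covers every partial-failure case */
        mempool_exit(&my_page_pool);
        bioset_exit(&my_bio_set);
    }
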

+ 3 - 3
drivers/block/drbd/drbd_receiver.c

@@ -378,7 +378,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
 	if (drbd_insert_fault(device, DRBD_FAULT_AL_EE))
 		return NULL;
 
-	peer_req = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
+	peer_req = mempool_alloc(&drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
 	if (!peer_req) {
 		if (!(gfp_mask & __GFP_NOWARN))
 			drbd_err(device, "%s: allocation failed\n", __func__);
@@ -409,7 +409,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
 	return peer_req;
 
  fail:
-	mempool_free(peer_req, drbd_ee_mempool);
+	mempool_free(peer_req, &drbd_ee_mempool);
 	return NULL;
 }
 
@@ -426,7 +426,7 @@ void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *
 		peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
 		drbd_al_complete_io(device, &peer_req->i);
 	}
-	mempool_free(peer_req, drbd_ee_mempool);
+	mempool_free(peer_req, &drbd_ee_mempool);
 }
 
 int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list)

+ 2 - 2
drivers/block/drbd/drbd_req.c

@@ -55,7 +55,7 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio
 {
 	struct drbd_request *req;
 
-	req = mempool_alloc(drbd_request_mempool, GFP_NOIO);
+	req = mempool_alloc(&drbd_request_mempool, GFP_NOIO);
 	if (!req)
 		return NULL;
 	memset(req, 0, sizeof(*req));
@@ -184,7 +184,7 @@ void drbd_req_destroy(struct kref *kref)
 		}
 	}
 
-	mempool_free(req, drbd_request_mempool);
+	mempool_free(req, &drbd_request_mempool);
 }
 
 static void wake_all_senders(struct drbd_connection *connection)

+ 1 - 1
drivers/block/drbd/drbd_req.h

@@ -269,7 +269,7 @@ enum drbd_req_state_bits {
 static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src)
 {
 	struct bio *bio;
-	bio = bio_clone_fast(bio_src, GFP_NOIO, drbd_io_bio_set);
+	bio = bio_clone_fast(bio_src, GFP_NOIO, &drbd_io_bio_set);
 
 	req->private_bio = bio;
 

+ 1 - 1
drivers/block/floppy.c

@@ -4450,7 +4450,7 @@ static ssize_t floppy_cmos_show(struct device *dev,
 	return sprintf(buf, "%X\n", UDP->cmos);
 }
 
-static DEVICE_ATTR(cmos, S_IRUGO, floppy_cmos_show, NULL);
+static DEVICE_ATTR(cmos, 0444, floppy_cmos_show, NULL);
 
 static struct attribute *floppy_dev_attrs[] = {
 	&dev_attr_cmos.attr,

+ 9 - 8
drivers/block/loop.c

@@ -732,7 +732,7 @@ static ssize_t loop_attr_do_show_##_name(struct device *d,		\
 	return loop_attr_show(d, b, loop_attr_##_name##_show);		\
 }									\
 static struct device_attribute loop_attr_##_name =			\
-	__ATTR(_name, S_IRUGO, loop_attr_do_show_##_name, NULL);
+	__ATTR(_name, 0444, loop_attr_do_show_##_name, NULL);
 
 static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf)
 {
@@ -809,16 +809,17 @@ static struct attribute_group loop_attribute_group = {
 	.attrs= loop_attrs,
 };
 
-static int loop_sysfs_init(struct loop_device *lo)
+static void loop_sysfs_init(struct loop_device *lo)
 {
-	return sysfs_create_group(&disk_to_dev(lo->lo_disk)->kobj,
-				  &loop_attribute_group);
+	lo->sysfs_inited = !sysfs_create_group(&disk_to_dev(lo->lo_disk)->kobj,
+						&loop_attribute_group);
 }
 
 static void loop_sysfs_exit(struct loop_device *lo)
 {
-	sysfs_remove_group(&disk_to_dev(lo->lo_disk)->kobj,
-			   &loop_attribute_group);
+	if (lo->sysfs_inited)
+		sysfs_remove_group(&disk_to_dev(lo->lo_disk)->kobj,
+				   &loop_attribute_group);
 }
 
 static void loop_config_discard(struct loop_device *lo)
@@ -1677,9 +1678,9 @@ static const struct block_device_operations lo_fops = {
  * And now the modules code and kernel interface.
  */
 static int max_loop;
-module_param(max_loop, int, S_IRUGO);
+module_param(max_loop, int, 0444);
 MODULE_PARM_DESC(max_loop, "Maximum number of loop devices");
-module_param(max_part, int, S_IRUGO);
+module_param(max_part, int, 0444);
 MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR);

+ 1 - 0
drivers/block/loop.h

@@ -58,6 +58,7 @@ struct loop_device {
 	struct kthread_worker	worker;
 	struct task_struct	*worker_task;
 	bool			use_dio;
+	bool			sysfs_inited;
 
 	struct request_queue	*lo_queue;
 	struct blk_mq_tag_set	tag_set;
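
The loop changes replace an ignored sysfs_create_group() return value with a recorded one: loop_sysfs_init() notes whether the attribute group was actually created, and loop_sysfs_exit() only removes it in that case, avoiding a warning when creation failed. A sketch of the pattern with a hypothetical device structure:

    #include <linux/device.h>
    #include <linux/sysfs.h>

    struct mydev {
        struct device *dev;
        const struct attribute_group *grp;
        bool sysfs_inited;
    };

    static void mydev_sysfs_init(struct mydev *md)
    {
        /* tolerate failure; just remember whether the group exists */
        md->sysfs_inited = !sysfs_create_group(&md->dev->kobj, md->grp);
    }

    static void mydev_sysfs_exit(struct mydev *md)
    {
        if (md->sysfs_inited)
            sysfs_remove_group(&md->dev->kobj, md->grp);
    }
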

+ 10 - 19
drivers/block/mtip32xx/mtip32xx.c

@@ -2285,7 +2285,7 @@ static ssize_t mtip_hw_show_status(struct device *dev,
 	return size;
 }
 
-static DEVICE_ATTR(status, S_IRUGO, mtip_hw_show_status, NULL);
+static DEVICE_ATTR(status, 0444, mtip_hw_show_status, NULL);
 
 /* debugsfs entries */
 
@@ -2566,10 +2566,9 @@ static int mtip_hw_debugfs_init(struct driver_data *dd)
 		return -1;
 	}
 
-	debugfs_create_file("flags", S_IRUGO, dd->dfs_node, dd,
-							&mtip_flags_fops);
-	debugfs_create_file("registers", S_IRUGO, dd->dfs_node, dd,
-							&mtip_regs_fops);
+	debugfs_create_file("flags", 0444, dd->dfs_node, dd, &mtip_flags_fops);
+	debugfs_create_file("registers", 0444, dd->dfs_node, dd,
+			    &mtip_regs_fops);
 
 	return 0;
 }
@@ -2726,15 +2725,11 @@ static void mtip_softirq_done_fn(struct request *rq)
 	blk_mq_end_request(rq, cmd->status);
 }
 
-static void mtip_abort_cmd(struct request *req, void *data,
-							bool reserved)
+static void mtip_abort_cmd(struct request *req, void *data, bool reserved)
 {
 	struct mtip_cmd *cmd = blk_mq_rq_to_pdu(req);
 	struct driver_data *dd = data;
 
-	if (!blk_mq_request_started(req))
-		return;
-
 	dbg_printk(MTIP_DRV_NAME " Aborting request, tag = %d\n", req->tag);
 
 	clear_bit(req->tag, dd->port->cmds_to_issue);
@@ -2742,14 +2737,10 @@ static void mtip_abort_cmd(struct request *req, void *data,
 	mtip_softirq_done_fn(req);
 }
 
-static void mtip_queue_cmd(struct request *req, void *data,
-							bool reserved)
+static void mtip_queue_cmd(struct request *req, void *data, bool reserved)
 {
 	struct driver_data *dd = data;
 
-	if (!blk_mq_request_started(req))
-		return;
-
 	set_bit(req->tag, dd->port->cmds_to_issue);
 	blk_abort_request(req);
 }
@@ -3720,7 +3711,8 @@ static enum blk_eh_timer_return mtip_cmd_timeout(struct request *req,
 		struct mtip_cmd *cmd = blk_mq_rq_to_pdu(req);
 
 		cmd->status = BLK_STS_TIMEOUT;
-		return BLK_EH_HANDLED;
+		blk_mq_complete_request(req);
+		return BLK_EH_DONE;
 	}
 
 	if (test_bit(req->tag, dd->port->cmds_to_issue))
@@ -3862,7 +3854,6 @@ skip_create_disk:
 	blk_queue_max_hw_sectors(dd->queue, 0xffff);
 	blk_queue_max_segment_size(dd->queue, 0x400000);
 	blk_queue_io_min(dd->queue, 4096);
-	blk_queue_bounce_limit(dd->queue, dd->pdev->dma_mask);
 
 	/* Signal trim support */
 	if (dd->trim_supp == true) {
@@ -4273,7 +4264,7 @@ static int mtip_pci_probe(struct pci_dev *pdev,
 	if (!dd->isr_workq) {
 		dev_warn(&pdev->dev, "Can't create wq %d\n", dd->instance);
 		rv = -ENOMEM;
-		goto block_initialize_err;
+		goto setmask_err;
 	}
 
 	memset(cpu_list, 0, sizeof(cpu_list));
@@ -4614,7 +4605,7 @@ static int __init mtip_init(void)
 	}
 	if (dfs_parent) {
 		dfs_device_status = debugfs_create_file("device_status",
-					S_IRUGO, dfs_parent, NULL,
+					0444, dfs_parent, NULL,
 					&mtip_device_status_fops);
 		if (IS_ERR_OR_NULL(dfs_device_status)) {
 			pr_err("Error creating device_status node\n");

+ 52 - 25
drivers/block/nbd.c

@@ -166,16 +166,19 @@ static ssize_t pid_show(struct device *dev,
 }
 
 static const struct device_attribute pid_attr = {
-	.attr = { .name = "pid", .mode = S_IRUGO},
+	.attr = { .name = "pid", .mode = 0444},
 	.show = pid_show,
 };
 
 static void nbd_dev_remove(struct nbd_device *nbd)
 {
 	struct gendisk *disk = nbd->disk;
+	struct request_queue *q;
+
 	if (disk) {
+		q = disk->queue;
 		del_gendisk(disk);
-		blk_cleanup_queue(disk->queue);
+		blk_cleanup_queue(q);
 		blk_mq_free_tag_set(&nbd->tag_set);
 		disk->private_data = NULL;
 		put_disk(disk);
@@ -213,7 +216,15 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
 	}
 	if (!nsock->dead) {
 		kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
-		atomic_dec(&nbd->config->live_connections);
+		if (atomic_dec_return(&nbd->config->live_connections) == 0) {
+			if (test_and_clear_bit(NBD_DISCONNECT_REQUESTED,
+					       &nbd->config->runtime_flags)) {
+				set_bit(NBD_DISCONNECTED,
+					&nbd->config->runtime_flags);
+				dev_info(nbd_to_dev(nbd),
+					"Disconnected due to user request.\n");
+			}
+		}
 	}
 	nsock->dead = true;
 	nsock->pending = NULL;
@@ -231,9 +242,22 @@ static void nbd_size_clear(struct nbd_device *nbd)
 static void nbd_size_update(struct nbd_device *nbd)
 {
 	struct nbd_config *config = nbd->config;
+	struct block_device *bdev = bdget_disk(nbd->disk, 0);
+
+	if (config->flags & NBD_FLAG_SEND_TRIM) {
+		nbd->disk->queue->limits.discard_granularity = config->blksize;
+		blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
+	}
 	blk_queue_logical_block_size(nbd->disk->queue, config->blksize);
 	blk_queue_physical_block_size(nbd->disk->queue, config->blksize);
 	set_capacity(nbd->disk, config->bytesize >> 9);
+	if (bdev) {
+		if (bdev->bd_disk)
+			bd_set_size(bdev, config->bytesize);
+		else
+			bdev->bd_invalidated = 1;
+		bdput(bdev);
+	}
 	kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
 }
 
@@ -243,6 +267,8 @@ static void nbd_size_set(struct nbd_device *nbd, loff_t blocksize,
 	struct nbd_config *config = nbd->config;
 	config->blksize = blocksize;
 	config->bytesize = blocksize * nr_blocks;
+	if (nbd->task_recv != NULL)
+		nbd_size_update(nbd);
 }
 
 static void nbd_complete_rq(struct request *req)
@@ -286,13 +312,15 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
 
 	if (!refcount_inc_not_zero(&nbd->config_refs)) {
 		cmd->status = BLK_STS_TIMEOUT;
-		return BLK_EH_HANDLED;
+		goto done;
 	}
 	config = nbd->config;
 
 	if (config->num_connections > 1) {
 		dev_err_ratelimited(nbd_to_dev(nbd),
-				    "Connection timed out, retrying\n");
+				    "Connection timed out, retrying (%d/%d alive)\n",
+				    atomic_read(&config->live_connections),
+				    config->num_connections);
 		/*
 		 * Hooray we have more connections, requeue this IO, the submit
 		 * path will put it on a real connection.
@@ -314,7 +342,7 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
 			}
 			blk_mq_requeue_request(req, true);
 			nbd_config_put(nbd);
-			return BLK_EH_NOT_HANDLED;
+			return BLK_EH_DONE;
 		}
 	} else {
 		dev_err_ratelimited(nbd_to_dev(nbd),
@@ -324,8 +352,9 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
 	cmd->status = BLK_STS_IOERR;
 	sock_shutdown(nbd);
 	nbd_config_put(nbd);
-
-	return BLK_EH_HANDLED;
+done:
+	blk_mq_complete_request(req);
+	return BLK_EH_DONE;
 }
 
 /*
@@ -647,11 +676,8 @@ static void recv_work(struct work_struct *work)
 
 static void nbd_clear_req(struct request *req, void *data, bool reserved)
 {
-	struct nbd_cmd *cmd;
+	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
 
-	if (!blk_mq_request_started(req))
-		return;
-	cmd = blk_mq_rq_to_pdu(req);
 	cmd->status = BLK_STS_IOERR;
 	blk_mq_complete_request(req);
 }
@@ -714,10 +740,9 @@ static int wait_for_reconnect(struct nbd_device *nbd)
 		return 0;
 	if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
 		return 0;
-	wait_event_timeout(config->conn_wait,
-			   atomic_read(&config->live_connections),
-			   config->dead_conn_timeout);
-	return atomic_read(&config->live_connections);
+	return wait_event_timeout(config->conn_wait,
+				  atomic_read(&config->live_connections) > 0,
+				  config->dead_conn_timeout) > 0;
 }
 
 static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
@@ -950,10 +975,6 @@ static void nbd_bdev_reset(struct block_device *bdev)
 	if (bdev->bd_openers > 1)
 		return;
 	bd_set_size(bdev, 0);
-	if (max_part > 0) {
-		blkdev_reread_part(bdev);
-		bdev->bd_invalidated = 1;
-	}
 }
 
 static void nbd_parse_flags(struct nbd_device *nbd)
@@ -1040,6 +1061,8 @@ static void nbd_config_put(struct nbd_device *nbd)
 		nbd->config = NULL;
 
 		nbd->tag_set.timeout = 0;
+		nbd->disk->queue->limits.discard_granularity = 0;
+		blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
 		blk_queue_flag_clear(QUEUE_FLAG_DISCARD, nbd->disk->queue);
 
 		mutex_unlock(&nbd->config_lock);
@@ -1109,7 +1132,6 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *b
 	if (ret)
 		return ret;
 
-	bd_set_size(bdev, config->bytesize);
 	if (max_part)
 		bdev->bd_invalidated = 1;
 	mutex_unlock(&nbd->config_lock);
@@ -1118,7 +1140,7 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *b
 	if (ret)
 		sock_shutdown(nbd);
 	mutex_lock(&nbd->config_lock);
-	bd_set_size(bdev, 0);
+	nbd_bdev_reset(bdev);
 	/* user requested, ignore socket errors */
 	if (test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags))
 		ret = 0;
@@ -1269,6 +1291,9 @@ static int nbd_open(struct block_device *bdev, fmode_t mode)
 		refcount_set(&nbd->config_refs, 1);
 		refcount_inc(&nbd->refs);
 		mutex_unlock(&nbd->config_lock);
+		bdev->bd_invalidated = 1;
+	} else if (nbd_disconnected(nbd->config)) {
+		bdev->bd_invalidated = 1;
 	}
 out:
 	mutex_unlock(&nbd_index_mutex);
@@ -1490,8 +1515,8 @@ static int nbd_dev_add(int index)
 	 */
 	blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
 	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
-	disk->queue->limits.discard_granularity = 512;
-	blk_queue_max_discard_sectors(disk->queue, UINT_MAX);
+	disk->queue->limits.discard_granularity = 0;
+	blk_queue_max_discard_sectors(disk->queue, 0);
 	blk_queue_max_segment_size(disk->queue, UINT_MAX);
 	blk_queue_max_segments(disk->queue, USHRT_MAX);
 	blk_queue_max_hw_sectors(disk->queue, 65536);
@@ -1755,6 +1780,7 @@ static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info)
 	}
 	mutex_lock(&nbd->config_lock);
 	nbd_disconnect(nbd);
+	nbd_clear_sock(nbd);
 	mutex_unlock(&nbd->config_lock);
 	if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
 			       &nbd->config->runtime_flags))
@@ -2093,7 +2119,8 @@ static int __init nbd_init(void)
 	if (nbds_max > 1UL << (MINORBITS - part_shift))
 		return -EINVAL;
 	recv_workqueue = alloc_workqueue("knbd-recv",
-					 WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
+					 WQ_MEM_RECLAIM | WQ_HIGHPRI |
+					 WQ_UNBOUND, 0);
 	if (!recv_workqueue)
 		return -ENOMEM;
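
Besides the discard and resize fixes, the nbd rework tightens wait_for_reconnect(): wait_event_timeout() returns 0 only when the wait timed out with the condition still false, and a positive value whenever the condition became true, so returning that result directly avoids re-reading the connection count afterwards (by which time the connection may already be dead again). A minimal sketch of the idiom:

    #include <linux/wait.h>
    #include <linux/atomic.h>

    static bool wait_for_live_connection(wait_queue_head_t *wq,
                                         atomic_t *live,
                                         unsigned long timeout)
    {
        /* > 0 means the condition was observed true within the timeout */
        return wait_event_timeout(*wq, atomic_read(live) > 0, timeout) > 0;
    }
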
 

+ 19 - 17
drivers/block/null_blk.c

@@ -157,23 +157,23 @@ enum {
 };
 
 static int g_no_sched;
-module_param_named(no_sched, g_no_sched, int, S_IRUGO);
+module_param_named(no_sched, g_no_sched, int, 0444);
 MODULE_PARM_DESC(no_sched, "No io scheduler");
 
 static int g_submit_queues = 1;
-module_param_named(submit_queues, g_submit_queues, int, S_IRUGO);
+module_param_named(submit_queues, g_submit_queues, int, 0444);
 MODULE_PARM_DESC(submit_queues, "Number of submission queues");
 
 static int g_home_node = NUMA_NO_NODE;
-module_param_named(home_node, g_home_node, int, S_IRUGO);
+module_param_named(home_node, g_home_node, int, 0444);
 MODULE_PARM_DESC(home_node, "Home node for the device");
 
 #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
 static char g_timeout_str[80];
-module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), S_IRUGO);
+module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), 0444);
 
 static char g_requeue_str[80];
-module_param_string(requeue, g_requeue_str, sizeof(g_requeue_str), S_IRUGO);
+module_param_string(requeue, g_requeue_str, sizeof(g_requeue_str), 0444);
 #endif
 
 static int g_queue_mode = NULL_Q_MQ;
@@ -203,27 +203,27 @@ static const struct kernel_param_ops null_queue_mode_param_ops = {
 	.get	= param_get_int,
 };
 
-device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, S_IRUGO);
+device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, 0444);
 MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)");
 
 static int g_gb = 250;
-module_param_named(gb, g_gb, int, S_IRUGO);
+module_param_named(gb, g_gb, int, 0444);
 MODULE_PARM_DESC(gb, "Size in GB");
 
 static int g_bs = 512;
-module_param_named(bs, g_bs, int, S_IRUGO);
+module_param_named(bs, g_bs, int, 0444);
 MODULE_PARM_DESC(bs, "Block size (in bytes)");
 
 static int nr_devices = 1;
-module_param(nr_devices, int, S_IRUGO);
+module_param(nr_devices, int, 0444);
 MODULE_PARM_DESC(nr_devices, "Number of devices to register");
 
 static bool g_blocking;
-module_param_named(blocking, g_blocking, bool, S_IRUGO);
+module_param_named(blocking, g_blocking, bool, 0444);
 MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device");
 
 static bool shared_tags;
-module_param(shared_tags, bool, S_IRUGO);
+module_param(shared_tags, bool, 0444);
 MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq");
 
 static int g_irqmode = NULL_IRQ_SOFTIRQ;
@@ -239,19 +239,19 @@ static const struct kernel_param_ops null_irqmode_param_ops = {
 	.get	= param_get_int,
 };
 
-device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, S_IRUGO);
+device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, 0444);
 MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer");
 
 static unsigned long g_completion_nsec = 10000;
-module_param_named(completion_nsec, g_completion_nsec, ulong, S_IRUGO);
+module_param_named(completion_nsec, g_completion_nsec, ulong, 0444);
 MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns");
 
 static int g_hw_queue_depth = 64;
-module_param_named(hw_queue_depth, g_hw_queue_depth, int, S_IRUGO);
+module_param_named(hw_queue_depth, g_hw_queue_depth, int, 0444);
 MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64");
 
 static bool g_use_per_node_hctx;
-module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, S_IRUGO);
+module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444);
 MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false");
 
 static struct nullb_device *null_alloc_dev(void);
@@ -1365,7 +1365,8 @@ static blk_qc_t null_queue_bio(struct request_queue *q, struct bio *bio)
 static enum blk_eh_timer_return null_rq_timed_out_fn(struct request *rq)
 {
 	pr_info("null: rq %p timed out\n", rq);
-	return BLK_EH_HANDLED;
+	blk_mq_complete_request(rq);
+	return BLK_EH_DONE;
 }
 
 static int null_rq_prep_fn(struct request_queue *q, struct request *req)
@@ -1427,7 +1428,8 @@ static void null_request_fn(struct request_queue *q)
 static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res)
 {
 	pr_info("null: rq %p timed out\n", rq);
-	return BLK_EH_HANDLED;
+	blk_mq_complete_request(rq);
+	return BLK_EH_DONE;
 }
 
 static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,

+ 1 - 1
drivers/block/paride/pd.c

@@ -740,7 +740,7 @@ static int pd_special_command(struct pd_unit *disk,
 {
 	struct request *rq;
 
-	rq = blk_get_request(disk->gd->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
+	rq = blk_get_request(disk->gd->queue, REQ_OP_DRV_IN, 0);
 	if (IS_ERR(rq))
 		return PTR_ERR(rq);
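
The pd.c hunk is the first of many identical conversions below (pktcdvd, sx8, virtio_blk, cdrom, the ide drivers): blk_get_request() now takes blk_mq_req_flags_t rather than a gfp_t, so 0 means "sleep until a request is free", BLK_MQ_REQ_NOWAIT replaces the GFP_NOWAIT callers, and BLK_MQ_REQ_PREEMPT is passed directly instead of going through blk_get_request_flags(). A sketch of the new calling convention, with a hypothetical driver command:

    #include <linux/blkdev.h>
    #include <linux/blk-mq.h>
    #include <linux/err.h>

    static int mydrv_issue_internal_cmd(struct request_queue *q, bool can_sleep)
    {
        struct request *rq;

        rq = blk_get_request(q, REQ_OP_DRV_IN,
                             can_sleep ? 0 : BLK_MQ_REQ_NOWAIT);
        if (IS_ERR(rq))
            return PTR_ERR(rq);

        /* ... fill in the driver-private command here ... */

        blk_execute_rq(q, NULL, rq, 0);
        blk_put_request(rq);
        return 0;
    }
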
 

+ 30 - 30
drivers/block/pktcdvd.c

@@ -97,8 +97,8 @@ static int pktdev_major;
 static int write_congestion_on  = PKT_WRITE_CONGESTION_ON;
 static int write_congestion_off = PKT_WRITE_CONGESTION_OFF;
 static struct mutex ctl_mutex;	/* Serialize open/close/setup/teardown */
-static mempool_t *psd_pool;
-static struct bio_set *pkt_bio_set;
+static mempool_t psd_pool;
+static struct bio_set pkt_bio_set;
 
 static struct class	*class_pktcdvd = NULL;    /* /sys/class/pktcdvd */
 static struct dentry	*pkt_debugfs_root = NULL; /* /sys/kernel/debug/pktcdvd */
@@ -478,8 +478,8 @@ static void pkt_debugfs_dev_new(struct pktcdvd_device *pd)
 	if (!pd->dfs_d_root)
 		return;
 
-	pd->dfs_f_info = debugfs_create_file("info", S_IRUGO,
-				pd->dfs_d_root, pd, &debug_fops);
+	pd->dfs_f_info = debugfs_create_file("info", 0444,
+					     pd->dfs_d_root, pd, &debug_fops);
 }
 
 static void pkt_debugfs_dev_remove(struct pktcdvd_device *pd)
@@ -631,7 +631,7 @@ static inline struct pkt_rb_node *pkt_rbtree_next(struct pkt_rb_node *node)
 static void pkt_rbtree_erase(struct pktcdvd_device *pd, struct pkt_rb_node *node)
 {
 	rb_erase(&node->rb_node, &pd->bio_queue);
-	mempool_free(node, pd->rb_pool);
+	mempool_free(node, &pd->rb_pool);
 	pd->bio_queue_size--;
 	BUG_ON(pd->bio_queue_size < 0);
 }
@@ -704,13 +704,13 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
 	int ret = 0;
 
 	rq = blk_get_request(q, (cgc->data_direction == CGC_DATA_WRITE) ?
-			     REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, __GFP_RECLAIM);
+			     REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
 	if (IS_ERR(rq))
 		return PTR_ERR(rq);
 
 	if (cgc->buflen) {
 		ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen,
-				      __GFP_RECLAIM);
+				      GFP_NOIO);
 		if (ret)
 			goto out;
 	}
@@ -1285,7 +1285,7 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
 	 * Fill-in bvec with data from orig_bios.
 	 */
 	spin_lock(&pkt->lock);
-	bio_copy_data(pkt->w_bio, pkt->orig_bios.head);
+	bio_list_copy_data(pkt->w_bio, pkt->orig_bios.head);
 
 	pkt_set_state(pkt, PACKET_WRITE_WAIT_STATE);
 	spin_unlock(&pkt->lock);
@@ -2303,14 +2303,14 @@ static void pkt_end_io_read_cloned(struct bio *bio)
 	psd->bio->bi_status = bio->bi_status;
 	bio_put(bio);
 	bio_endio(psd->bio);
-	mempool_free(psd, psd_pool);
+	mempool_free(psd, &psd_pool);
 	pkt_bio_finished(pd);
 }
 
 static void pkt_make_request_read(struct pktcdvd_device *pd, struct bio *bio)
 {
-	struct bio *cloned_bio = bio_clone_fast(bio, GFP_NOIO, pkt_bio_set);
-	struct packet_stacked_data *psd = mempool_alloc(psd_pool, GFP_NOIO);
+	struct bio *cloned_bio = bio_clone_fast(bio, GFP_NOIO, &pkt_bio_set);
+	struct packet_stacked_data *psd = mempool_alloc(&psd_pool, GFP_NOIO);
 
 	psd->pd = pd;
 	psd->bio = bio;
@@ -2381,7 +2381,7 @@ static void pkt_make_request_write(struct request_queue *q, struct bio *bio)
 	/*
 	 * No matching packet found. Store the bio in the work queue.
 	 */
-	node = mempool_alloc(pd->rb_pool, GFP_NOIO);
+	node = mempool_alloc(&pd->rb_pool, GFP_NOIO);
 	node->bio = bio;
 	spin_lock(&pd->lock);
 	BUG_ON(pd->bio_queue_size < 0);
@@ -2451,7 +2451,7 @@ static blk_qc_t pkt_make_request(struct request_queue *q, struct bio *bio)
 
 			split = bio_split(bio, last_zone -
 					  bio->bi_iter.bi_sector,
-					  GFP_NOIO, pkt_bio_set);
+					  GFP_NOIO, &pkt_bio_set);
 			bio_chain(split, bio);
 		} else {
 			split = bio;
@@ -2707,9 +2707,9 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
 	if (!pd)
 		goto out_mutex;
 
-	pd->rb_pool = mempool_create_kmalloc_pool(PKT_RB_POOL_SIZE,
-						  sizeof(struct pkt_rb_node));
-	if (!pd->rb_pool)
+	ret = mempool_init_kmalloc_pool(&pd->rb_pool, PKT_RB_POOL_SIZE,
+					sizeof(struct pkt_rb_node));
+	if (ret)
 		goto out_mem;
 
 	INIT_LIST_HEAD(&pd->cdrw.pkt_free_list);
@@ -2766,7 +2766,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
 out_mem2:
 	put_disk(disk);
 out_mem:
-	mempool_destroy(pd->rb_pool);
+	mempool_exit(&pd->rb_pool);
 	kfree(pd);
 out_mutex:
 	mutex_unlock(&ctl_mutex);
@@ -2817,7 +2817,7 @@ static int pkt_remove_dev(dev_t pkt_dev)
 	blk_cleanup_queue(pd->disk->queue);
 	put_disk(pd->disk);
 
-	mempool_destroy(pd->rb_pool);
+	mempool_exit(&pd->rb_pool);
 	kfree(pd);
 
 	/* This is safe: open() is still holding a reference. */
@@ -2914,14 +2914,14 @@ static int __init pkt_init(void)
 
 	mutex_init(&ctl_mutex);
 
-	psd_pool = mempool_create_kmalloc_pool(PSD_POOL_SIZE,
-					sizeof(struct packet_stacked_data));
-	if (!psd_pool)
-		return -ENOMEM;
-	pkt_bio_set = bioset_create(BIO_POOL_SIZE, 0, 0);
-	if (!pkt_bio_set) {
-		mempool_destroy(psd_pool);
-		return -ENOMEM;
+	ret = mempool_init_kmalloc_pool(&psd_pool, PSD_POOL_SIZE,
+				    sizeof(struct packet_stacked_data));
+	if (ret)
+		return ret;
+	ret = bioset_init(&pkt_bio_set, BIO_POOL_SIZE, 0, 0);
+	if (ret) {
+		mempool_exit(&psd_pool);
+		return ret;
 	}
 
 	ret = register_blkdev(pktdev_major, DRIVER_NAME);
@@ -2954,8 +2954,8 @@ out_misc:
 out:
 	unregister_blkdev(pktdev_major, DRIVER_NAME);
 out2:
-	mempool_destroy(psd_pool);
-	bioset_free(pkt_bio_set);
+	mempool_exit(&psd_pool);
+	bioset_exit(&pkt_bio_set);
 	return ret;
 }
 
@@ -2968,8 +2968,8 @@ static void __exit pkt_exit(void)
 	pkt_sysfs_cleanup();
 
 	unregister_blkdev(pktdev_major, DRIVER_NAME);
-	mempool_destroy(psd_pool);
-	bioset_free(pkt_bio_set);
+	mempool_exit(&psd_pool);
+	bioset_exit(&pkt_bio_set);
 }
 
 MODULE_DESCRIPTION("Packet writing layer for CD/DVD drives");

+ 0 - 2
drivers/block/ps3disk.c

@@ -465,8 +465,6 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev)
 	priv->queue = queue;
 	queue->queuedata = dev;
 
-	blk_queue_bounce_limit(queue, BLK_BOUNCE_HIGH);
-
 	blk_queue_max_hw_sectors(queue, dev->bounce_size >> 9);
 	blk_queue_segment_boundary(queue, -1UL);
 	blk_queue_dma_alignment(queue, dev->blk_size-1);

+ 22 - 22
drivers/block/rbd.c

@@ -424,7 +424,7 @@ static struct workqueue_struct *rbd_wq;
  * single-major requires >= 0.75 version of userspace rbd utility.
  */
 static bool single_major = true;
-module_param(single_major, bool, S_IRUGO);
+module_param(single_major, bool, 0444);
 MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");
 
 static ssize_t rbd_add(struct bus_type *bus, const char *buf,
@@ -468,11 +468,11 @@ static ssize_t rbd_supported_features_show(struct bus_type *bus, char *buf)
 	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
 }
 
-static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
-static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
-static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
-static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
-static BUS_ATTR(supported_features, S_IRUGO, rbd_supported_features_show, NULL);
+static BUS_ATTR(add, 0200, NULL, rbd_add);
+static BUS_ATTR(remove, 0200, NULL, rbd_remove);
+static BUS_ATTR(add_single_major, 0200, NULL, rbd_add_single_major);
+static BUS_ATTR(remove_single_major, 0200, NULL, rbd_remove_single_major);
+static BUS_ATTR(supported_features, 0444, rbd_supported_features_show, NULL);
 
 static struct attribute *rbd_bus_attrs[] = {
 	&bus_attr_add.attr,
@@ -4204,22 +4204,22 @@ static ssize_t rbd_image_refresh(struct device *dev,
 	return size;
 }
 
-static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
-static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
-static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
-static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
-static DEVICE_ATTR(client_addr, S_IRUGO, rbd_client_addr_show, NULL);
-static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
-static DEVICE_ATTR(cluster_fsid, S_IRUGO, rbd_cluster_fsid_show, NULL);
-static DEVICE_ATTR(config_info, S_IRUSR, rbd_config_info_show, NULL);
-static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
-static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
-static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
-static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
-static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
-static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
-static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
-static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
+static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
+static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
+static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
+static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
+static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
+static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
+static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
+static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
+static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
+static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
+static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
+static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
+static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
+static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
+static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
+static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);
 
 static struct attribute *rbd_attrs[] = {
 	&dev_attr_size.attr,

+ 3 - 3
drivers/block/rsxx/core.c

@@ -247,19 +247,19 @@ static void rsxx_debugfs_dev_new(struct rsxx_cardinfo *card)
 	if (IS_ERR_OR_NULL(card->debugfs_dir))
 		goto failed_debugfs_dir;
 
-	debugfs_stats = debugfs_create_file("stats", S_IRUGO,
+	debugfs_stats = debugfs_create_file("stats", 0444,
 					    card->debugfs_dir, card,
 					    &debugfs_stats_fops);
 	if (IS_ERR_OR_NULL(debugfs_stats))
 		goto failed_debugfs_stats;
 
-	debugfs_pci_regs = debugfs_create_file("pci_regs", S_IRUGO,
+	debugfs_pci_regs = debugfs_create_file("pci_regs", 0444,
 					       card->debugfs_dir, card,
 					       &debugfs_pci_regs_fops);
 	if (IS_ERR_OR_NULL(debugfs_pci_regs))
 		goto failed_debugfs_pci_regs;
 
-	debugfs_cram = debugfs_create_file("cram", S_IRUGO | S_IWUSR,
+	debugfs_cram = debugfs_create_file("cram", 0644,
 					   card->debugfs_dir, card,
 					   &debugfs_cram_fops);
 	if (IS_ERR_OR_NULL(debugfs_cram))

+ 1 - 1
drivers/block/sx8.c

@@ -567,7 +567,7 @@ static struct carm_request *carm_get_special(struct carm_host *host)
 	if (!crq)
 		return NULL;
 
-	rq = blk_get_request(host->oob_q, REQ_OP_DRV_OUT, GFP_KERNEL);
+	rq = blk_get_request(host->oob_q, REQ_OP_DRV_OUT, 0);
 	if (IS_ERR(rq)) {
 		spin_lock_irqsave(&host->lock, flags);
 		carm_put_request(host, crq);

+ 4 - 4
drivers/block/virtio_blk.c

@@ -298,7 +298,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
 	struct request *req;
 	int err;
 
-	req = blk_get_request(q, REQ_OP_DRV_IN, GFP_KERNEL);
+	req = blk_get_request(q, REQ_OP_DRV_IN, 0);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -371,7 +371,7 @@ static ssize_t virtblk_serial_show(struct device *dev,
 	return err;
 }
 
-static DEVICE_ATTR(serial, S_IRUGO, virtblk_serial_show, NULL);
+static DEVICE_ATTR(serial, 0444, virtblk_serial_show, NULL);
 
 /* The queue's logical block size must be set before calling this */
 static void virtblk_update_capacity(struct virtio_blk *vblk, bool resize)
@@ -576,10 +576,10 @@ virtblk_cache_type_show(struct device *dev, struct device_attribute *attr,
 }
 
 static const struct device_attribute dev_attr_cache_type_ro =
-	__ATTR(cache_type, S_IRUGO,
+	__ATTR(cache_type, 0444,
 	       virtblk_cache_type_show, NULL);
 static const struct device_attribute dev_attr_cache_type_rw =
-	__ATTR(cache_type, S_IRUGO|S_IWUSR,
+	__ATTR(cache_type, 0644,
 	       virtblk_cache_type_show, virtblk_cache_type_store);
 
 static int virtblk_init_request(struct blk_mq_tag_set *set, struct request *rq,

+ 1 - 1
drivers/block/xen-blkback/blkback.c

@@ -98,7 +98,7 @@ MODULE_PARM_DESC(max_queues,
  * backend, 4KB page granularity is used.
  */
 unsigned int xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
-module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, S_IRUGO);
+module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, 0444);
 MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring");
 /*
  * The LRU mechanism to clean the lists of persistent grants needs to

+ 2 - 2
drivers/block/xen-blkback/xenbus.c

@@ -367,7 +367,7 @@ int __init xen_blkif_interface_init(void)
 out:									\
 		return sprintf(buf, format, result);			\
 	}								\
-	static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+	static DEVICE_ATTR(name, 0444, show_##name, NULL)
 
 VBD_SHOW_ALLRING(oo_req,  "%llu\n");
 VBD_SHOW_ALLRING(rd_req,  "%llu\n");
@@ -403,7 +403,7 @@ static const struct attribute_group xen_vbdstat_group = {
 									\
 		return sprintf(buf, format, ##args);			\
 	}								\
-	static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+	static DEVICE_ATTR(name, 0444, show_##name, NULL)
 
 VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor);
 VBD_SHOW(mode, "%s\n", be->mode);

+ 3 - 4
drivers/block/xen-blkfront.c

@@ -129,13 +129,12 @@ static const struct block_device_operations xlvbd_block_fops;
  */
 
 static unsigned int xen_blkif_max_segments = 32;
-module_param_named(max_indirect_segments, xen_blkif_max_segments, uint,
-		   S_IRUGO);
+module_param_named(max_indirect_segments, xen_blkif_max_segments, uint, 0444);
 MODULE_PARM_DESC(max_indirect_segments,
 		 "Maximum amount of segments in indirect requests (default is 32)");
 
 static unsigned int xen_blkif_max_queues = 4;
-module_param_named(max_queues, xen_blkif_max_queues, uint, S_IRUGO);
+module_param_named(max_queues, xen_blkif_max_queues, uint, 0444);
 MODULE_PARM_DESC(max_queues, "Maximum number of hardware queues/rings used per virtual disk");
 
 /*
@@ -143,7 +142,7 @@ MODULE_PARM_DESC(max_queues, "Maximum number of hardware queues/rings used per v
  * backend, 4KB page granularity is used.
  */
 static unsigned int xen_blkif_max_ring_order;
-module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, S_IRUGO);
+module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, 0444);
 MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring");
 
 #define BLK_RING_SIZE(info)	\

+ 1 - 1
drivers/cdrom/cdrom.c

@@ -2192,7 +2192,7 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf,
 
 		len = nr * CD_FRAMESIZE_RAW;
 
-		rq = blk_get_request(q, REQ_OP_SCSI_IN, GFP_KERNEL);
+		rq = blk_get_request(q, REQ_OP_SCSI_IN, 0);
 		if (IS_ERR(rq)) {
 			ret = PTR_ERR(rq);
 			break;

+ 1 - 1
drivers/ide/ide-atapi.c

@@ -92,7 +92,7 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk,
 	struct request *rq;
 	int error;
 
-	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
+	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
 	ide_req(rq)->type = ATA_PRIV_MISC;
 	rq->special = (char *)pc;
 

+ 1 - 1
drivers/ide/ide-cd.c

@@ -437,7 +437,7 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
 		bool delay = false;
 
 		rq = blk_get_request(drive->queue,
-			write ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN,  __GFP_RECLAIM);
+			write ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
 		memcpy(scsi_req(rq)->cmd, cmd, BLK_MAX_CDB);
 		ide_req(rq)->type = ATA_PRIV_PC;
 		rq->rq_flags |= rq_flags;

+ 1 - 1
drivers/ide/ide-cd_ioctl.c

@@ -304,7 +304,7 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi)
 	struct request *rq;
 	int ret;
 
-	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
+	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
 	ide_req(rq)->type = ATA_PRIV_MISC;
 	rq->rq_flags = RQF_QUIET;
 	blk_execute_rq(drive->queue, cd->disk, rq, 0);

+ 1 - 1
drivers/ide/ide-devsets.c

@@ -166,7 +166,7 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting,
 	if (!(setting->flags & DS_SYNC))
 		return setting->set(drive, arg);
 
-	rq = blk_get_request(q, REQ_OP_DRV_IN, __GFP_RECLAIM);
+	rq = blk_get_request(q, REQ_OP_DRV_IN, 0);
 	ide_req(rq)->type = ATA_PRIV_MISC;
 	scsi_req(rq)->cmd_len = 5;
 	scsi_req(rq)->cmd[0] = REQ_DEVSET_EXEC;

+ 1 - 1
drivers/ide/ide-disk.c

@@ -478,7 +478,7 @@ static int set_multcount(ide_drive_t *drive, int arg)
 	if (drive->special_flags & IDE_SFLAG_SET_MULTMODE)
 		return -EBUSY;
 
-	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
+	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
 	ide_req(rq)->type = ATA_PRIV_TASKFILE;
 
 	drive->mult_req = arg;

+ 2 - 2
drivers/ide/ide-ioctls.c

@@ -125,7 +125,7 @@ static int ide_cmd_ioctl(ide_drive_t *drive, unsigned long arg)
 	if (NULL == (void *) arg) {
 		struct request *rq;
 
-		rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
+		rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
 		ide_req(rq)->type = ATA_PRIV_TASKFILE;
 		blk_execute_rq(drive->queue, NULL, rq, 0);
 		err = scsi_req(rq)->result ? -EIO : 0;
@@ -222,7 +222,7 @@ static int generic_drive_reset(ide_drive_t *drive)
 	struct request *rq;
 	int ret = 0;
 
-	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
+	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
 	ide_req(rq)->type = ATA_PRIV_MISC;
 	scsi_req(rq)->cmd_len = 1;
 	scsi_req(rq)->cmd[0] = REQ_DRIVE_RESET;

+ 2 - 2
drivers/ide/ide-park.c

@@ -32,7 +32,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
 	}
 	spin_unlock_irq(&hwif->lock);
 
-	rq = blk_get_request(q, REQ_OP_DRV_IN, __GFP_RECLAIM);
+	rq = blk_get_request(q, REQ_OP_DRV_IN, 0);
 	scsi_req(rq)->cmd[0] = REQ_PARK_HEADS;
 	scsi_req(rq)->cmd_len = 1;
 	ide_req(rq)->type = ATA_PRIV_MISC;
@@ -47,7 +47,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
 	 * Make sure that *some* command is sent to the drive after the
 	 * timeout has expired, so power management will be reenabled.
 	 */
-	rq = blk_get_request(q, REQ_OP_DRV_IN, GFP_NOWAIT);
+	rq = blk_get_request(q, REQ_OP_DRV_IN, BLK_MQ_REQ_NOWAIT);
 	if (IS_ERR(rq))
 		goto out;
 

+ 2 - 3
drivers/ide/ide-pm.c

@@ -19,7 +19,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
 	}
 
 	memset(&rqpm, 0, sizeof(rqpm));
-	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
+	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
 	ide_req(rq)->type = ATA_PRIV_PM_SUSPEND;
 	rq->special = &rqpm;
 	rqpm.pm_step = IDE_PM_START_SUSPEND;
@@ -90,8 +90,7 @@ int generic_ide_resume(struct device *dev)
 	}
 
 	memset(&rqpm, 0, sizeof(rqpm));
-	rq = blk_get_request_flags(drive->queue, REQ_OP_DRV_IN,
-				   BLK_MQ_REQ_PREEMPT);
+	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, BLK_MQ_REQ_PREEMPT);
 	ide_req(rq)->type = ATA_PRIV_PM_RESUME;
 	rq->special = &rqpm;
 	rqpm.pm_step = IDE_PM_START_RESUME;

+ 2 - 2
drivers/ide/ide-tape.c

@@ -854,7 +854,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size)
 	BUG_ON(cmd != REQ_IDETAPE_READ && cmd != REQ_IDETAPE_WRITE);
 	BUG_ON(size < 0 || size % tape->blk_size);
 
-	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
+	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
 	ide_req(rq)->type = ATA_PRIV_MISC;
 	scsi_req(rq)->cmd[13] = cmd;
 	rq->rq_disk = tape->disk;
@@ -862,7 +862,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size)
 
 	if (size) {
 		ret = blk_rq_map_kern(drive->queue, rq, tape->buf, size,
-				      __GFP_RECLAIM);
+				      GFP_NOIO);
 		if (ret)
 			goto out_put;
 	}

+ 2 - 2
drivers/ide/ide-taskfile.c

@@ -431,7 +431,7 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf,
 
 	rq = blk_get_request(drive->queue,
 		(cmd->tf_flags & IDE_TFLAG_WRITE) ?
-			REQ_OP_DRV_OUT : REQ_OP_DRV_IN, __GFP_RECLAIM);
+			REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
 	ide_req(rq)->type = ATA_PRIV_TASKFILE;
 
 	/*
@@ -442,7 +442,7 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf,
 	 */
 	if (nsect) {
 		error = blk_rq_map_kern(drive->queue, rq, buf,
-					nsect * SECTOR_SIZE, __GFP_RECLAIM);
+					nsect * SECTOR_SIZE, GFP_NOIO);
 		if (error)
 			goto put_req;
 	}

+ 5 - 5
drivers/lightnvm/core.c

@@ -431,7 +431,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
 	return 0;
 err_sysfs:
 	if (tt->exit)
-		tt->exit(targetdata);
+		tt->exit(targetdata, true);
 err_init:
 	blk_cleanup_queue(tqueue);
 	tdisk->queue = NULL;
@@ -446,7 +446,7 @@ err_reserve:
 	return ret;
 }
 
-static void __nvm_remove_target(struct nvm_target *t)
+static void __nvm_remove_target(struct nvm_target *t, bool graceful)
 {
 	struct nvm_tgt_type *tt = t->type;
 	struct gendisk *tdisk = t->disk;
@@ -459,7 +459,7 @@ static void __nvm_remove_target(struct nvm_target *t)
 		tt->sysfs_exit(tdisk);
 
 	if (tt->exit)
-		tt->exit(tdisk->private_data);
+		tt->exit(tdisk->private_data, graceful);
 
 	nvm_remove_tgt_dev(t->dev, 1);
 	put_disk(tdisk);
@@ -489,7 +489,7 @@ static int nvm_remove_tgt(struct nvm_dev *dev, struct nvm_ioctl_remove *remove)
 		mutex_unlock(&dev->mlock);
 		return 1;
 	}
-	__nvm_remove_target(t);
+	__nvm_remove_target(t, true);
 	mutex_unlock(&dev->mlock);
 
 	return 0;
@@ -963,7 +963,7 @@ void nvm_unregister(struct nvm_dev *dev)
 	list_for_each_entry_safe(t, tmp, &dev->targets, list) {
 		if (t->dev->parent != dev)
 			continue;
-		__nvm_remove_target(t);
+		__nvm_remove_target(t, false);
 	}
 	mutex_unlock(&dev->mlock);
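
The extra argument to tt->exit() carries the new graceful-teardown flag: true for a user-requested removal, false when nvm_unregister() tears targets down because the underlying device is going away, so a target can skip flushing state to hardware that is already gone. A sketch of how a target's exit hook might use it, with a hypothetical context structure:

    #include <linux/slab.h>

    struct mytgt_ctx {
        void *buffered_state;
    };

    static void mytgt_exit(void *private, bool graceful)
    {
        struct mytgt_ctx *ctx = private;

        if (graceful) {
            /* device still present: flush buffered data and close open
             * lines/metadata before letting go of the context */
        }
        /* forced or not, release the in-memory state */
        kfree(ctx->buffered_state);
        kfree(ctx);
    }
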
 

+ 6 - 4
drivers/lightnvm/pblk-cache.c

@@ -44,13 +44,15 @@ retry:
 		goto out;
 	}
 
-	if (unlikely(!bio_has_data(bio)))
-		goto out;
-
 	pblk_ppa_set_empty(&w_ctx.ppa);
 	w_ctx.flags = flags;
-	if (bio->bi_opf & REQ_PREFLUSH)
+	if (bio->bi_opf & REQ_PREFLUSH) {
 		w_ctx.flags |= PBLK_FLUSH_ENTRY;
+		pblk_write_kick(pblk);
+	}
+
+	if (unlikely(!bio_has_data(bio)))
+		goto out;
 
 	for (i = 0; i < nr_entries; i++) {
 		void *data = bio_data(bio);

+ 154 - 79
drivers/lightnvm/pblk-core.c

@@ -40,7 +40,7 @@ static void pblk_line_mark_bb(struct work_struct *work)
 	}
 
 	kfree(ppa);
-	mempool_free(line_ws, pblk->gen_ws_pool);
+	mempool_free(line_ws, &pblk->gen_ws_pool);
 }
 
 static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line,
@@ -102,7 +102,7 @@ static void pblk_end_io_erase(struct nvm_rq *rqd)
 	struct pblk *pblk = rqd->private;
 
 	__pblk_end_io_erase(pblk, rqd);
-	mempool_free(rqd, pblk->e_rq_pool);
+	mempool_free(rqd, &pblk->e_rq_pool);
 }
 
 /*
@@ -237,15 +237,15 @@ struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int type)
 	switch (type) {
 	case PBLK_WRITE:
 	case PBLK_WRITE_INT:
-		pool = pblk->w_rq_pool;
+		pool = &pblk->w_rq_pool;
 		rq_size = pblk_w_rq_size;
 		break;
 	case PBLK_READ:
-		pool = pblk->r_rq_pool;
+		pool = &pblk->r_rq_pool;
 		rq_size = pblk_g_rq_size;
 		break;
 	default:
-		pool = pblk->e_rq_pool;
+		pool = &pblk->e_rq_pool;
 		rq_size = pblk_g_rq_size;
 	}
 
@@ -265,20 +265,22 @@ void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type)
 	case PBLK_WRITE:
 		kfree(((struct pblk_c_ctx *)nvm_rq_to_pdu(rqd))->lun_bitmap);
 	case PBLK_WRITE_INT:
-		pool = pblk->w_rq_pool;
+		pool = &pblk->w_rq_pool;
 		break;
 	case PBLK_READ:
-		pool = pblk->r_rq_pool;
+		pool = &pblk->r_rq_pool;
 		break;
 	case PBLK_ERASE:
-		pool = pblk->e_rq_pool;
+		pool = &pblk->e_rq_pool;
 		break;
 	default:
 		pr_err("pblk: trying to free unknown rqd type\n");
 		return;
 	}
 
-	nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list);
+	if (rqd->meta_list)
+		nvm_dev_dma_free(dev->parent, rqd->meta_list,
+				rqd->dma_meta_list);
 	mempool_free(rqd, pool);
 }
 
@@ -292,7 +294,7 @@ void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off,
 
 	for (i = off; i < nr_pages + off; i++) {
 		bv = bio->bi_io_vec[i];
-		mempool_free(bv.bv_page, pblk->page_bio_pool);
+		mempool_free(bv.bv_page, &pblk->page_bio_pool);
 	}
 }
 
@@ -304,23 +306,23 @@ int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags,
 	int i, ret;
 
 	for (i = 0; i < nr_pages; i++) {
-		page = mempool_alloc(pblk->page_bio_pool, flags);
+		page = mempool_alloc(&pblk->page_bio_pool, flags);
 
 		ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0);
 		if (ret != PBLK_EXPOSED_PAGE_SIZE) {
 			pr_err("pblk: could not add page to bio\n");
-			mempool_free(page, pblk->page_bio_pool);
+			mempool_free(page, &pblk->page_bio_pool);
 			goto err;
 		}
 	}
 
 	return 0;
 err:
-	pblk_bio_free_pages(pblk, bio, 0, i - 1);
+	pblk_bio_free_pages(pblk, bio, (bio->bi_vcnt - i), i);
 	return -1;
 }
 
-static void pblk_write_kick(struct pblk *pblk)
+void pblk_write_kick(struct pblk *pblk)
 {
 	wake_up_process(pblk->writer_ts);
 	mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(1000));
@@ -342,13 +344,6 @@ void pblk_write_should_kick(struct pblk *pblk)
 		pblk_write_kick(pblk);
 }
 
-void pblk_end_io_sync(struct nvm_rq *rqd)
-{
-	struct completion *waiting = rqd->private;
-
-	complete(waiting);
-}
-
 static void pblk_wait_for_meta(struct pblk *pblk)
 {
 	do {
@@ -380,7 +375,13 @@ struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line)
 
 	lockdep_assert_held(&line->lock);
 
-	if (!vsc) {
+	if (line->w_err_gc->has_write_err) {
+		if (line->gc_group != PBLK_LINEGC_WERR) {
+			line->gc_group = PBLK_LINEGC_WERR;
+			move_list = &l_mg->gc_werr_list;
+			pblk_rl_werr_line_in(&pblk->rl);
+		}
+	} else if (!vsc) {
 		if (line->gc_group != PBLK_LINEGC_FULL) {
 			line->gc_group = PBLK_LINEGC_FULL;
 			move_list = &l_mg->gc_full_list;
@@ -467,16 +468,13 @@ int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd)
 {
 	struct nvm_tgt_dev *dev = pblk->dev;
 
-#ifdef CONFIG_NVM_DEBUG
-	int ret;
+	atomic_inc(&pblk->inflight_io);
 
-	ret = pblk_check_io(pblk, rqd);
-	if (ret)
-		return ret;
+#ifdef CONFIG_NVM_DEBUG
+	if (pblk_check_io(pblk, rqd))
+		return NVM_IO_ERR;
 #endif
 
-	atomic_inc(&pblk->inflight_io);
-
 	return nvm_submit_io(dev, rqd);
 }
 
@@ -484,16 +482,13 @@ int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd)
 {
 	struct nvm_tgt_dev *dev = pblk->dev;
 
-#ifdef CONFIG_NVM_DEBUG
-	int ret;
+	atomic_inc(&pblk->inflight_io);
 
-	ret = pblk_check_io(pblk, rqd);
-	if (ret)
-		return ret;
+#ifdef CONFIG_NVM_DEBUG
+	if (pblk_check_io(pblk, rqd))
+		return NVM_IO_ERR;
 #endif
 
-	atomic_inc(&pblk->inflight_io);
-
 	return nvm_submit_io_sync(dev, rqd);
 }
 
@@ -856,9 +851,10 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line,
 	atomic_dec(&pblk->inflight_io);
 
 	if (rqd.error) {
-		if (dir == PBLK_WRITE)
+		if (dir == PBLK_WRITE) {
 			pblk_log_write_err(pblk, &rqd);
-		else if (dir == PBLK_READ)
+			ret = 1;
+		} else if (dir == PBLK_READ)
 			pblk_log_read_err(pblk, &rqd);
 	}
 
@@ -1071,6 +1067,25 @@ static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line,
 	return 1;
 }
 
+static int pblk_line_alloc_bitmaps(struct pblk *pblk, struct pblk_line *line)
+{
+	struct pblk_line_meta *lm = &pblk->lm;
+
+	line->map_bitmap = kzalloc(lm->sec_bitmap_len, GFP_KERNEL);
+	if (!line->map_bitmap)
+		return -ENOMEM;
+
+	/* will be initialized using bb info from map_bitmap */
+	line->invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_KERNEL);
+	if (!line->invalid_bitmap) {
+		kfree(line->map_bitmap);
+		line->map_bitmap = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
 /* For now lines are always assumed full lines. Thus, smeta former and current
  * lun bitmaps are omitted.
  */
@@ -1108,7 +1123,7 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line,
 
 	if (init && pblk_line_submit_smeta_io(pblk, line, off, PBLK_WRITE)) {
 		pr_debug("pblk: line smeta I/O failed. Retry\n");
-		return 1;
+		return 0;
 	}
 
 	bitmap_copy(line->invalid_bitmap, line->map_bitmap, lm->sec_per_line);
@@ -1174,19 +1189,9 @@ static int pblk_prepare_new_line(struct pblk *pblk, struct pblk_line *line)
 static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line)
 {
 	struct pblk_line_meta *lm = &pblk->lm;
+	int blk_in_line = atomic_read(&line->blk_in_line);
 	int blk_to_erase;
 
-	line->map_bitmap = kzalloc(lm->sec_bitmap_len, GFP_ATOMIC);
-	if (!line->map_bitmap)
-		return -ENOMEM;
-
-	/* will be initialized using bb info from map_bitmap */
-	line->invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_ATOMIC);
-	if (!line->invalid_bitmap) {
-		kfree(line->map_bitmap);
-		return -ENOMEM;
-	}
-
 	/* Bad blocks do not need to be erased */
 	bitmap_copy(line->erase_bitmap, line->blk_bitmap, lm->blk_per_line);
 
@@ -1199,16 +1204,19 @@ static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line)
 		blk_to_erase = pblk_prepare_new_line(pblk, line);
 		line->state = PBLK_LINESTATE_FREE;
 	} else {
-		blk_to_erase = atomic_read(&line->blk_in_line);
+		blk_to_erase = blk_in_line;
 	}
 
-	if (line->state != PBLK_LINESTATE_FREE) {
-		kfree(line->map_bitmap);
-		kfree(line->invalid_bitmap);
+	if (blk_in_line < lm->min_blk_line) {
 		spin_unlock(&line->lock);
+		return -EAGAIN;
+	}
+
+	if (line->state != PBLK_LINESTATE_FREE) {
 		WARN(1, "pblk: corrupted line %d, state %d\n",
 							line->id, line->state);
-		return -EAGAIN;
+		spin_unlock(&line->lock);
+		return -EINTR;
 	}
 
 	line->state = PBLK_LINESTATE_OPEN;
@@ -1241,13 +1249,16 @@ int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line)
 	}
 	spin_unlock(&l_mg->free_lock);
 
-	pblk_rl_free_lines_dec(&pblk->rl, line, true);
+	ret = pblk_line_alloc_bitmaps(pblk, line);
+	if (ret)
+		return ret;
 
 	if (!pblk_line_init_bb(pblk, line, 0)) {
 		list_add(&line->list, &l_mg->free_list);
 		return -EINTR;
 	}
 
+	pblk_rl_free_lines_dec(&pblk->rl, line, true);
 	return 0;
 }
 
@@ -1259,6 +1270,24 @@ void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line)
 	line->emeta = NULL;
 }
 
+static void pblk_line_reinit(struct pblk_line *line)
+{
+	*line->vsc = cpu_to_le32(EMPTY_ENTRY);
+
+	line->map_bitmap = NULL;
+	line->invalid_bitmap = NULL;
+	line->smeta = NULL;
+	line->emeta = NULL;
+}
+
+void pblk_line_free(struct pblk_line *line)
+{
+	kfree(line->map_bitmap);
+	kfree(line->invalid_bitmap);
+
+	pblk_line_reinit(line);
+}
+
 struct pblk_line *pblk_line_get(struct pblk *pblk)
 {
 	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
@@ -1292,10 +1321,14 @@ retry:
 
 	ret = pblk_line_prepare(pblk, line);
 	if (ret) {
-		if (ret == -EAGAIN) {
+		switch (ret) {
+		case -EAGAIN:
+			list_add(&line->list, &l_mg->bad_list);
+			goto retry;
+		case -EINTR:
 			list_add(&line->list, &l_mg->corrupt_list);
 			goto retry;
-		} else {
+		default:
 			pr_err("pblk: failed to prepare line %d\n", line->id);
 			list_add(&line->list, &l_mg->free_list);
 			l_mg->nr_free_lines++;
@@ -1321,11 +1354,14 @@ retry:
 		return NULL;
 	}
 
+	retry_line->map_bitmap = line->map_bitmap;
+	retry_line->invalid_bitmap = line->invalid_bitmap;
 	retry_line->smeta = line->smeta;
 	retry_line->emeta = line->emeta;
 	retry_line->meta_line = line->meta_line;
 
-	pblk_line_free(pblk, line);
+	pblk_line_reinit(line);
+
 	l_mg->data_line = retry_line;
 	spin_unlock(&l_mg->free_lock);
 
@@ -1378,6 +1414,9 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk)
 	}
 	spin_unlock(&l_mg->free_lock);
 
+	if (pblk_line_alloc_bitmaps(pblk, line))
+		return NULL;
+
 	if (pblk_line_erase(pblk, line)) {
 		line = pblk_line_retry(pblk, line);
 		if (!line)
@@ -1449,7 +1488,7 @@ static void pblk_line_close_meta_sync(struct pblk *pblk)
 	flush_workqueue(pblk->close_wq);
 }
 
-void pblk_pipeline_stop(struct pblk *pblk)
+void __pblk_pipeline_flush(struct pblk *pblk)
 {
 	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
 	int ret;
@@ -1474,6 +1513,11 @@ void pblk_pipeline_stop(struct pblk *pblk)
 
 	flush_workqueue(pblk->bb_wq);
 	pblk_line_close_meta_sync(pblk);
+}
+
+void __pblk_pipeline_stop(struct pblk *pblk)
+{
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
 
 	spin_lock(&l_mg->free_lock);
 	pblk->state = PBLK_STATE_STOPPED;
@@ -1482,6 +1526,12 @@ void pblk_pipeline_stop(struct pblk *pblk)
 	spin_unlock(&l_mg->free_lock);
 }
 
+void pblk_pipeline_stop(struct pblk *pblk)
+{
+	__pblk_pipeline_flush(pblk);
+	__pblk_pipeline_stop(pblk);
+}
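
The split keeps pblk_pipeline_stop() behaving as before while letting teardown skip the flush when the exit is not graceful; pblk_tear_down() in pblk-init.c further down uses it exactly that way. A minimal sketch of such a caller, with only the wrapper name invented:

/* Sketch: composing the flush/stop split, mirroring pblk_tear_down(). */
#include "pblk.h"

static void example_tear_down(struct pblk *pblk, bool graceful)
{
	if (graceful)
		__pblk_pipeline_flush(pblk);	/* drain writes, sync line metadata */
	__pblk_pipeline_stop(pblk);		/* mark the instance stopped */
}
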
+
 struct pblk_line *pblk_line_replace_data(struct pblk *pblk)
 {
 	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
@@ -1511,6 +1561,9 @@ retry_erase:
 		goto retry_erase;
 	}
 
+	if (pblk_line_alloc_bitmaps(pblk, new))
+		return NULL;
+
 retry_setup:
 	if (!pblk_line_init_metadata(pblk, new, cur)) {
 		new = pblk_line_retry(pblk, new);
@@ -1550,19 +1603,6 @@ out:
 	return new;
 }
 
-void pblk_line_free(struct pblk *pblk, struct pblk_line *line)
-{
-	kfree(line->map_bitmap);
-	kfree(line->invalid_bitmap);
-
-	*line->vsc = cpu_to_le32(EMPTY_ENTRY);
-
-	line->map_bitmap = NULL;
-	line->invalid_bitmap = NULL;
-	line->smeta = NULL;
-	line->emeta = NULL;
-}
-
 static void __pblk_line_put(struct pblk *pblk, struct pblk_line *line)
 {
 	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
@@ -1572,9 +1612,14 @@ static void __pblk_line_put(struct pblk *pblk, struct pblk_line *line)
 	WARN_ON(line->state != PBLK_LINESTATE_GC);
 	line->state = PBLK_LINESTATE_FREE;
 	line->gc_group = PBLK_LINEGC_NONE;
-	pblk_line_free(pblk, line);
-	spin_unlock(&line->lock);
+	pblk_line_free(line);
+
+	if (line->w_err_gc->has_write_err) {
+		pblk_rl_werr_line_out(&pblk->rl);
+		line->w_err_gc->has_write_err = 0;
+	}
 
+	spin_unlock(&line->lock);
 	atomic_dec(&gc->pipeline_gc);
 
 	spin_lock(&l_mg->free_lock);
@@ -1593,7 +1638,7 @@ static void pblk_line_put_ws(struct work_struct *work)
 	struct pblk_line *line = line_put_ws->line;
 
 	__pblk_line_put(pblk, line);
-	mempool_free(line_put_ws, pblk->gen_ws_pool);
+	mempool_free(line_put_ws, &pblk->gen_ws_pool);
 }
 
 void pblk_line_put(struct kref *ref)
@@ -1610,7 +1655,7 @@ void pblk_line_put_wq(struct kref *ref)
 	struct pblk *pblk = line->pblk;
 	struct pblk_line_ws *line_put_ws;
 
-	line_put_ws = mempool_alloc(pblk->gen_ws_pool, GFP_ATOMIC);
+	line_put_ws = mempool_alloc(&pblk->gen_ws_pool, GFP_ATOMIC);
 	if (!line_put_ws)
 		return;
 
@@ -1737,11 +1782,34 @@ void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line)
 
 	spin_lock(&l_mg->close_lock);
 	spin_lock(&line->lock);
+
+	/* Update the in-memory start address for emeta, in case it has
+	 * shifted due to write errors
+	 */
+	if (line->emeta_ssec != line->cur_sec)
+		line->emeta_ssec = line->cur_sec;
+
 	list_add_tail(&line->list, &l_mg->emeta_list);
 	spin_unlock(&line->lock);
 	spin_unlock(&l_mg->close_lock);
 
 	pblk_line_should_sync_meta(pblk);
+}
+
+static void pblk_save_lba_list(struct pblk *pblk, struct pblk_line *line)
+{
+	struct pblk_line_meta *lm = &pblk->lm;
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	unsigned int lba_list_size = lm->emeta_len[2];
+	struct pblk_w_err_gc *w_err_gc = line->w_err_gc;
+	struct pblk_emeta *emeta = line->emeta;
+
+	w_err_gc->lba_list = pblk_malloc(lba_list_size,
+					 l_mg->emeta_alloc_type, GFP_KERNEL);
+	memcpy(w_err_gc->lba_list, emeta_to_lbas(pblk, emeta->buf),
+				lba_list_size);
 }
 
 void pblk_line_close_ws(struct work_struct *work)
@@ -1750,9 +1818,16 @@ void pblk_line_close_ws(struct work_struct *work)
 									ws);
 	struct pblk *pblk = line_ws->pblk;
 	struct pblk_line *line = line_ws->line;
+	struct pblk_w_err_gc *w_err_gc = line->w_err_gc;
+
+	/* Write errors make the emeta start address stored in smeta invalid,
+	 * so keep a copy of the lba list until we've gc'd the line
+	 */
+	if (w_err_gc->has_write_err)
+		pblk_save_lba_list(pblk, line);
 
 	pblk_line_close(pblk, line);
-	mempool_free(line_ws, pblk->gen_ws_pool);
+	mempool_free(line_ws, &pblk->gen_ws_pool);
 }
 
 void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv,
@@ -1761,7 +1836,7 @@ void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv,
 {
 	struct pblk_line_ws *line_ws;
 
-	line_ws = mempool_alloc(pblk->gen_ws_pool, gfp_mask);
+	line_ws = mempool_alloc(&pblk->gen_ws_pool, gfp_mask);
 
 	line_ws->pblk = pblk;
 	line_ws->line = line;

+ 71 - 41
drivers/lightnvm/pblk-gc.c

@@ -129,6 +129,53 @@ out:
 	kfree(gc_rq_ws);
 }
 
+static __le64 *get_lba_list_from_emeta(struct pblk *pblk,
+				       struct pblk_line *line)
+{
+	struct line_emeta *emeta_buf;
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	struct pblk_line_meta *lm = &pblk->lm;
+	unsigned int lba_list_size = lm->emeta_len[2];
+	__le64 *lba_list;
+	int ret;
+
+	emeta_buf = pblk_malloc(lm->emeta_len[0],
+				l_mg->emeta_alloc_type, GFP_KERNEL);
+	if (!emeta_buf)
+		return NULL;
+
+	ret = pblk_line_read_emeta(pblk, line, emeta_buf);
+	if (ret) {
+		pr_err("pblk: line %d read emeta failed (%d)\n",
+				line->id, ret);
+		pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
+		return NULL;
+	}
+
+	/* If this read fails, it means that emeta is corrupted.
+	 * For now, leave the line untouched.
+	 * TODO: Implement a recovery routine that scans and moves
+	 * all sectors on the line.
+	 */
+
+	ret = pblk_recov_check_emeta(pblk, emeta_buf);
+	if (ret) {
+		pr_err("pblk: inconsistent emeta (line %d)\n",
+				line->id);
+		pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
+		return NULL;
+	}
+
+	lba_list = pblk_malloc(lba_list_size,
+			       l_mg->emeta_alloc_type, GFP_KERNEL);
+	if (lba_list)
+		memcpy(lba_list, emeta_to_lbas(pblk, emeta_buf), lba_list_size);
+
+	pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
+
+	return lba_list;
+}
+
 static void pblk_gc_line_prepare_ws(struct work_struct *work)
 {
 	struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws,
@@ -138,46 +185,26 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work)
 	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
 	struct pblk_line_meta *lm = &pblk->lm;
 	struct pblk_gc *gc = &pblk->gc;
-	struct line_emeta *emeta_buf;
 	struct pblk_line_ws *gc_rq_ws;
 	struct pblk_gc_rq *gc_rq;
 	__le64 *lba_list;
 	unsigned long *invalid_bitmap;
 	int sec_left, nr_secs, bit;
-	int ret;
 
 	invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_KERNEL);
 	if (!invalid_bitmap)
 		goto fail_free_ws;
 
-	emeta_buf = pblk_malloc(lm->emeta_len[0], l_mg->emeta_alloc_type,
-								GFP_KERNEL);
-	if (!emeta_buf) {
-		pr_err("pblk: cannot use GC emeta\n");
-		goto fail_free_bitmap;
-	}
-
-	ret = pblk_line_read_emeta(pblk, line, emeta_buf);
-	if (ret) {
-		pr_err("pblk: line %d read emeta failed (%d)\n", line->id, ret);
-		goto fail_free_emeta;
-	}
-
-	/* If this read fails, it means that emeta is corrupted. For now, leave
-	 * the line untouched. TODO: Implement a recovery routine that scans and
-	 * moves all sectors on the line.
-	 */
-
-	ret = pblk_recov_check_emeta(pblk, emeta_buf);
-	if (ret) {
-		pr_err("pblk: inconsistent emeta (line %d)\n", line->id);
-		goto fail_free_emeta;
-	}
-
-	lba_list = emeta_to_lbas(pblk, emeta_buf);
-	if (!lba_list) {
-		pr_err("pblk: could not interpret emeta (line %d)\n", line->id);
-		goto fail_free_emeta;
+	if (line->w_err_gc->has_write_err) {
+		lba_list = line->w_err_gc->lba_list;
+		line->w_err_gc->lba_list = NULL;
+	} else {
+		lba_list = get_lba_list_from_emeta(pblk, line);
+		if (!lba_list) {
+			pr_err("pblk: could not interpret emeta (line %d)\n",
+					line->id);
+			goto fail_free_ws;
+		}
 	}
 
 	spin_lock(&line->lock);
@@ -187,14 +214,14 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work)
 
 	if (sec_left < 0) {
 		pr_err("pblk: corrupted GC line (%d)\n", line->id);
-		goto fail_free_emeta;
+		goto fail_free_lba_list;
 	}
 
 	bit = -1;
 next_rq:
 	gc_rq = kmalloc(sizeof(struct pblk_gc_rq), GFP_KERNEL);
 	if (!gc_rq)
-		goto fail_free_emeta;
+		goto fail_free_lba_list;
 
 	nr_secs = 0;
 	do {
@@ -240,7 +267,7 @@ next_rq:
 		goto next_rq;
 
 out:
-	pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
+	pblk_mfree(lba_list, l_mg->emeta_alloc_type);
 	kfree(line_ws);
 	kfree(invalid_bitmap);
 
@@ -251,9 +278,8 @@ out:
 
 fail_free_gc_rq:
 	kfree(gc_rq);
-fail_free_emeta:
-	pblk_mfree(emeta_buf, l_mg->emeta_alloc_type);
-fail_free_bitmap:
+fail_free_lba_list:
+	pblk_mfree(lba_list, l_mg->emeta_alloc_type);
 	kfree(invalid_bitmap);
 fail_free_ws:
 	kfree(line_ws);
@@ -349,12 +375,14 @@ static struct pblk_line *pblk_gc_get_victim_line(struct pblk *pblk,
 static bool pblk_gc_should_run(struct pblk_gc *gc, struct pblk_rl *rl)
 {
 	unsigned int nr_blocks_free, nr_blocks_need;
+	unsigned int werr_lines = atomic_read(&rl->werr_lines);
 
 	nr_blocks_need = pblk_rl_high_thrs(rl);
 	nr_blocks_free = pblk_rl_nr_free_blks(rl);
 
 	/* This is not critical, no need to take lock here */
-	return ((gc->gc_active) && (nr_blocks_need > nr_blocks_free));
+	return ((werr_lines > 0) ||
+		((gc->gc_active) && (nr_blocks_need > nr_blocks_free)));
 }
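
With the werr_lines counter in the mix, the GC thread now wakes even when free-block pressure is low, purely to rewrite lines that saw write errors. A standalone restatement of the trigger above with simplified types (an illustration, not the pblk function):

#include <stdbool.h>

static bool gc_should_run(bool gc_active, unsigned int blocks_free,
			  unsigned int blocks_need, unsigned int werr_lines)
{
	/* Run GC if write-error lines are queued, or if free blocks run
	 * low while GC is active. */
	return werr_lines > 0 || (gc_active && blocks_need > blocks_free);
}
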
 
 void pblk_gc_free_full_lines(struct pblk *pblk)
@@ -649,7 +677,7 @@ fail_free_main_kthread:
 	return ret;
 }
 
-void pblk_gc_exit(struct pblk *pblk)
+void pblk_gc_exit(struct pblk *pblk, bool graceful)
 {
 	struct pblk_gc *gc = &pblk->gc;
 
@@ -663,10 +691,12 @@ void pblk_gc_exit(struct pblk *pblk)
 	if (gc->gc_reader_ts)
 		kthread_stop(gc->gc_reader_ts);
 
-	flush_workqueue(gc->gc_reader_wq);
-	destroy_workqueue(gc->gc_reader_wq);
+	if (graceful) {
+		flush_workqueue(gc->gc_reader_wq);
+		flush_workqueue(gc->gc_line_reader_wq);
+	}
 
-	flush_workqueue(gc->gc_line_reader_wq);
+	destroy_workqueue(gc->gc_reader_wq);
 	destroy_workqueue(gc->gc_line_reader_wq);
 
 	if (gc->gc_writer_ts)

+ 105 - 67
drivers/lightnvm/pblk-init.c

@@ -20,10 +20,15 @@
 
 #include "pblk.h"
 
+unsigned int write_buffer_size;
+
+module_param(write_buffer_size, uint, 0644);
+MODULE_PARM_DESC(write_buffer_size, "number of entries in a write buffer");
+
 static struct kmem_cache *pblk_ws_cache, *pblk_rec_cache, *pblk_g_rq_cache,
 				*pblk_w_rq_cache;
 static DECLARE_RWSEM(pblk_lock);
-struct bio_set *pblk_bio_set;
+struct bio_set pblk_bio_set;
 
 static int pblk_rw_io(struct request_queue *q, struct pblk *pblk,
 			  struct bio *bio)
@@ -127,10 +132,8 @@ static int pblk_l2p_recover(struct pblk *pblk, bool factory_init)
 	if (!line) {
 		/* Configure next line for user data */
 		line = pblk_line_get_first_data(pblk);
-		if (!line) {
-			pr_err("pblk: line list corrupted\n");
+		if (!line)
 			return -EFAULT;
-		}
 	}
 
 	return 0;
@@ -141,6 +144,7 @@ static int pblk_l2p_init(struct pblk *pblk, bool factory_init)
 	sector_t i;
 	struct ppa_addr ppa;
 	size_t map_size;
+	int ret = 0;
 
 	map_size = pblk_trans_map_size(pblk);
 	pblk->trans_map = vmalloc(map_size);
@@ -152,7 +156,11 @@ static int pblk_l2p_init(struct pblk *pblk, bool factory_init)
 	for (i = 0; i < pblk->rl.nr_secs; i++)
 		pblk_trans_map_set(pblk, i, ppa);
 
-	return pblk_l2p_recover(pblk, factory_init);
+	ret = pblk_l2p_recover(pblk, factory_init);
+	if (ret)
+		vfree(pblk->trans_map);
+
+	return ret;
 }
 
 static void pblk_rwb_free(struct pblk *pblk)
@@ -169,10 +177,15 @@ static int pblk_rwb_init(struct pblk *pblk)
 	struct nvm_tgt_dev *dev = pblk->dev;
 	struct nvm_geo *geo = &dev->geo;
 	struct pblk_rb_entry *entries;
-	unsigned long nr_entries;
+	unsigned long nr_entries, buffer_size;
 	unsigned int power_size, power_seg_sz;
 
-	nr_entries = pblk_rb_calculate_size(pblk->pgs_in_buffer);
+	if (write_buffer_size && (write_buffer_size > pblk->pgs_in_buffer))
+		buffer_size = write_buffer_size;
+	else
+		buffer_size = pblk->pgs_in_buffer;
+
+	nr_entries = pblk_rb_calculate_size(buffer_size);
 
 	entries = vzalloc(nr_entries * sizeof(struct pblk_rb_entry));
 	if (!entries)
@@ -341,7 +354,7 @@ static int pblk_core_init(struct pblk *pblk)
 {
 	struct nvm_tgt_dev *dev = pblk->dev;
 	struct nvm_geo *geo = &dev->geo;
-	int max_write_ppas;
+	int ret, max_write_ppas;
 
 	atomic64_set(&pblk->user_wa, 0);
 	atomic64_set(&pblk->pad_wa, 0);
@@ -375,33 +388,33 @@ static int pblk_core_init(struct pblk *pblk)
 		goto fail_free_pad_dist;
 
 	/* Internal bios can be at most the sectors signaled by the device. */
-	pblk->page_bio_pool = mempool_create_page_pool(NVM_MAX_VLBA, 0);
-	if (!pblk->page_bio_pool)
+	ret = mempool_init_page_pool(&pblk->page_bio_pool, NVM_MAX_VLBA, 0);
+	if (ret)
 		goto free_global_caches;
 
-	pblk->gen_ws_pool = mempool_create_slab_pool(PBLK_GEN_WS_POOL_SIZE,
-							pblk_ws_cache);
-	if (!pblk->gen_ws_pool)
+	ret = mempool_init_slab_pool(&pblk->gen_ws_pool, PBLK_GEN_WS_POOL_SIZE,
+				     pblk_ws_cache);
+	if (ret)
 		goto free_page_bio_pool;
 
-	pblk->rec_pool = mempool_create_slab_pool(geo->all_luns,
-							pblk_rec_cache);
-	if (!pblk->rec_pool)
+	ret = mempool_init_slab_pool(&pblk->rec_pool, geo->all_luns,
+				     pblk_rec_cache);
+	if (ret)
 		goto free_gen_ws_pool;
 
-	pblk->r_rq_pool = mempool_create_slab_pool(geo->all_luns,
-							pblk_g_rq_cache);
-	if (!pblk->r_rq_pool)
+	ret = mempool_init_slab_pool(&pblk->r_rq_pool, geo->all_luns,
+				     pblk_g_rq_cache);
+	if (ret)
 		goto free_rec_pool;
 
-	pblk->e_rq_pool = mempool_create_slab_pool(geo->all_luns,
-							pblk_g_rq_cache);
-	if (!pblk->e_rq_pool)
+	ret = mempool_init_slab_pool(&pblk->e_rq_pool, geo->all_luns,
+				     pblk_g_rq_cache);
+	if (ret)
 		goto free_r_rq_pool;
 
-	pblk->w_rq_pool = mempool_create_slab_pool(geo->all_luns,
-							pblk_w_rq_cache);
-	if (!pblk->w_rq_pool)
+	ret = mempool_init_slab_pool(&pblk->w_rq_pool, geo->all_luns,
+				     pblk_w_rq_cache);
+	if (ret)
 		goto free_e_rq_pool;
 
 	pblk->close_wq = alloc_workqueue("pblk-close-wq",
@@ -423,6 +436,7 @@ static int pblk_core_init(struct pblk *pblk)
 		goto free_r_end_wq;
 
 	INIT_LIST_HEAD(&pblk->compl_list);
+	INIT_LIST_HEAD(&pblk->resubmit_list);
 
 	return 0;
 
@@ -433,17 +447,17 @@ free_bb_wq:
 free_close_wq:
 	destroy_workqueue(pblk->close_wq);
 free_w_rq_pool:
-	mempool_destroy(pblk->w_rq_pool);
+	mempool_exit(&pblk->w_rq_pool);
 free_e_rq_pool:
-	mempool_destroy(pblk->e_rq_pool);
+	mempool_exit(&pblk->e_rq_pool);
 free_r_rq_pool:
-	mempool_destroy(pblk->r_rq_pool);
+	mempool_exit(&pblk->r_rq_pool);
 free_rec_pool:
-	mempool_destroy(pblk->rec_pool);
+	mempool_exit(&pblk->rec_pool);
 free_gen_ws_pool:
-	mempool_destroy(pblk->gen_ws_pool);
+	mempool_exit(&pblk->gen_ws_pool);
 free_page_bio_pool:
-	mempool_destroy(pblk->page_bio_pool);
+	mempool_exit(&pblk->page_bio_pool);
 free_global_caches:
 	pblk_free_global_caches(pblk);
 fail_free_pad_dist:
@@ -462,12 +476,12 @@ static void pblk_core_free(struct pblk *pblk)
 	if (pblk->bb_wq)
 		destroy_workqueue(pblk->bb_wq);
 
-	mempool_destroy(pblk->page_bio_pool);
-	mempool_destroy(pblk->gen_ws_pool);
-	mempool_destroy(pblk->rec_pool);
-	mempool_destroy(pblk->r_rq_pool);
-	mempool_destroy(pblk->e_rq_pool);
-	mempool_destroy(pblk->w_rq_pool);
+	mempool_exit(&pblk->page_bio_pool);
+	mempool_exit(&pblk->gen_ws_pool);
+	mempool_exit(&pblk->rec_pool);
+	mempool_exit(&pblk->r_rq_pool);
+	mempool_exit(&pblk->e_rq_pool);
+	mempool_exit(&pblk->w_rq_pool);
 
 	pblk_free_global_caches(pblk);
 	kfree(pblk->pad_dist);
@@ -489,11 +503,17 @@ static void pblk_line_mg_free(struct pblk *pblk)
 	}
 }
 
-static void pblk_line_meta_free(struct pblk_line *line)
+static void pblk_line_meta_free(struct pblk_line_mgmt *l_mg,
+				struct pblk_line *line)
 {
+	struct pblk_w_err_gc *w_err_gc = line->w_err_gc;
+
 	kfree(line->blk_bitmap);
 	kfree(line->erase_bitmap);
 	kfree(line->chks);
+
+	pblk_mfree(w_err_gc->lba_list, l_mg->emeta_alloc_type);
+	kfree(w_err_gc);
 }
 
 static void pblk_lines_free(struct pblk *pblk)
@@ -506,8 +526,8 @@ static void pblk_lines_free(struct pblk *pblk)
 	for (i = 0; i < l_mg->nr_lines; i++) {
 		line = &pblk->lines[i];
 
-		pblk_line_free(pblk, line);
-		pblk_line_meta_free(line);
+		pblk_line_free(line);
+		pblk_line_meta_free(l_mg, line);
 	}
 	spin_unlock(&l_mg->free_lock);
 
@@ -748,14 +768,14 @@ static int pblk_setup_line_meta_20(struct pblk *pblk, struct pblk_line *line,
 		chunk->cnlb = chunk_meta->cnlb;
 		chunk->wp = chunk_meta->wp;
 
-		if (!(chunk->state & NVM_CHK_ST_OFFLINE))
-			continue;
-
 		if (chunk->type & NVM_CHK_TP_SZ_SPEC) {
 			WARN_ONCE(1, "pblk: custom-sized chunks unsupported\n");
 			continue;
 		}
 
+		if (!(chunk->state & NVM_CHK_ST_OFFLINE))
+			continue;
+
 		set_bit(pos, line->blk_bitmap);
 		nr_bad_chks++;
 	}
@@ -809,20 +829,28 @@ static int pblk_alloc_line_meta(struct pblk *pblk, struct pblk_line *line)
 		return -ENOMEM;
 
 	line->erase_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL);
-	if (!line->erase_bitmap) {
-		kfree(line->blk_bitmap);
-		return -ENOMEM;
-	}
+	if (!line->erase_bitmap)
+		goto free_blk_bitmap;
 
 	line->chks = kmalloc(lm->blk_per_line * sizeof(struct nvm_chk_meta),
 								GFP_KERNEL);
-	if (!line->chks) {
-		kfree(line->erase_bitmap);
-		kfree(line->blk_bitmap);
-		return -ENOMEM;
-	}
+	if (!line->chks)
+		goto free_erase_bitmap;
+
+	line->w_err_gc = kzalloc(sizeof(struct pblk_w_err_gc), GFP_KERNEL);
+	if (!line->w_err_gc)
+		goto free_chks;
 
 	return 0;
+
+free_chks:
+	kfree(line->chks);
+free_erase_bitmap:
+	kfree(line->erase_bitmap);
+free_blk_bitmap:
+	kfree(line->blk_bitmap);
+	return -ENOMEM;
 }
 
 static int pblk_line_mg_init(struct pblk *pblk)
@@ -847,12 +875,14 @@ static int pblk_line_mg_init(struct pblk *pblk)
 	INIT_LIST_HEAD(&l_mg->gc_mid_list);
 	INIT_LIST_HEAD(&l_mg->gc_low_list);
 	INIT_LIST_HEAD(&l_mg->gc_empty_list);
+	INIT_LIST_HEAD(&l_mg->gc_werr_list);
 
 	INIT_LIST_HEAD(&l_mg->emeta_list);
 
-	l_mg->gc_lists[0] = &l_mg->gc_high_list;
-	l_mg->gc_lists[1] = &l_mg->gc_mid_list;
-	l_mg->gc_lists[2] = &l_mg->gc_low_list;
+	l_mg->gc_lists[0] = &l_mg->gc_werr_list;
+	l_mg->gc_lists[1] = &l_mg->gc_high_list;
+	l_mg->gc_lists[2] = &l_mg->gc_mid_list;
+	l_mg->gc_lists[3] = &l_mg->gc_low_list;
 
 	spin_lock_init(&l_mg->free_lock);
 	spin_lock_init(&l_mg->close_lock);
@@ -1047,6 +1077,11 @@ static int pblk_lines_init(struct pblk *pblk)
 		nr_free_chks += pblk_setup_line_meta(pblk, line, chunk_meta, i);
 	}
 
+	if (!nr_free_chks) {
+		pr_err("pblk: too many bad blocks, cannot create a sane instance\n");
+		return -EINTR;
+	}
+
 	pblk_set_provision(pblk, nr_free_chks);
 
 	kfree(chunk_meta);
@@ -1054,7 +1089,7 @@ static int pblk_lines_init(struct pblk *pblk)
 
 fail_free_lines:
 	while (--i >= 0)
-		pblk_line_meta_free(&pblk->lines[i]);
+		pblk_line_meta_free(l_mg, &pblk->lines[i]);
 	kfree(pblk->lines);
 fail_free_chunk_meta:
 	kfree(chunk_meta);
@@ -1110,23 +1145,25 @@ static void pblk_free(struct pblk *pblk)
 	kfree(pblk);
 }
 
-static void pblk_tear_down(struct pblk *pblk)
+static void pblk_tear_down(struct pblk *pblk, bool graceful)
 {
-	pblk_pipeline_stop(pblk);
+	if (graceful)
+		__pblk_pipeline_flush(pblk);
+	__pblk_pipeline_stop(pblk);
 	pblk_writer_stop(pblk);
 	pblk_rb_sync_l2p(&pblk->rwb);
 	pblk_rl_free(&pblk->rl);
 
-	pr_debug("pblk: consistent tear down\n");
+	pr_debug("pblk: consistent tear down (graceful:%d)\n", graceful);
 }
 
-static void pblk_exit(void *private)
+static void pblk_exit(void *private, bool graceful)
 {
 	struct pblk *pblk = private;
 
 	down_write(&pblk_lock);
-	pblk_gc_exit(pblk);
-	pblk_tear_down(pblk);
+	pblk_gc_exit(pblk, graceful);
+	pblk_tear_down(pblk, graceful);
 
 #ifdef CONFIG_NVM_DEBUG
 	pr_info("pblk exit: L2P CRC: %x\n", pblk_l2p_crc(pblk));
@@ -1175,6 +1212,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
 	pblk->state = PBLK_STATE_RUNNING;
 	pblk->gc.gc_enabled = 0;
 
+	spin_lock_init(&pblk->resubmit_lock);
 	spin_lock_init(&pblk->trans_lock);
 	spin_lock_init(&pblk->lock);
 
@@ -1297,18 +1335,18 @@ static int __init pblk_module_init(void)
 {
 	int ret;
 
-	pblk_bio_set = bioset_create(BIO_POOL_SIZE, 0, 0);
-	if (!pblk_bio_set)
-		return -ENOMEM;
+	ret = bioset_init(&pblk_bio_set, BIO_POOL_SIZE, 0, 0);
+	if (ret)
+		return ret;
 	ret = nvm_register_tgt_type(&tt_pblk);
 	if (ret)
-		bioset_free(pblk_bio_set);
+		bioset_exit(&pblk_bio_set);
 	return ret;
 }
 
 static void pblk_module_exit(void)
 {
-	bioset_free(pblk_bio_set);
+	bioset_exit(&pblk_bio_set);
 	nvm_unregister_tgt_type(&tt_pblk);
 }
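
The init/exit pair above shows the general shape of the conversion applied throughout these hunks, from heap-allocated bio_set and mempool objects to embedded ones: bioset_init() and the mempool_init_*() helpers return an errno instead of a pointer, and teardown becomes bioset_exit()/mempool_exit(). A minimal standalone module following the same shape; the object type, slab cache and all "example_" names are invented for illustration:

/* Sketch of the embedded bio_set/mempool pattern used in these hunks. */
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/mempool.h>
#include <linux/slab.h>

struct example_obj {
	int payload;
};

static struct bio_set example_bio_set;	/* embedded, not a pointer */
static mempool_t example_pool;		/* embedded, not a pointer */
static struct kmem_cache *example_cache;

static int __init example_init(void)
{
	int ret;

	example_cache = KMEM_CACHE(example_obj, 0);
	if (!example_cache)
		return -ENOMEM;

	ret = bioset_init(&example_bio_set, BIO_POOL_SIZE, 0, 0);
	if (ret)
		goto free_cache;

	ret = mempool_init_slab_pool(&example_pool, 16, example_cache);
	if (ret)
		goto free_bioset;

	return 0;

free_bioset:
	bioset_exit(&example_bio_set);
free_cache:
	kmem_cache_destroy(example_cache);
	return ret;
}

static void __exit example_exit(void)
{
	mempool_exit(&example_pool);
	bioset_exit(&example_bio_set);
	kmem_cache_destroy(example_cache);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
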
 

+ 24 - 9
drivers/lightnvm/pblk-map.c

@@ -18,11 +18,11 @@
 
 #include "pblk.h"
 
-static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
-			       struct ppa_addr *ppa_list,
-			       unsigned long *lun_bitmap,
-			       struct pblk_sec_meta *meta_list,
-			       unsigned int valid_secs)
+static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
+			      struct ppa_addr *ppa_list,
+			      unsigned long *lun_bitmap,
+			      struct pblk_sec_meta *meta_list,
+			      unsigned int valid_secs)
 {
 	struct pblk_line *line = pblk_line_get_data(pblk);
 	struct pblk_emeta *emeta;
@@ -35,8 +35,14 @@ static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
 	if (pblk_line_is_full(line)) {
 		struct pblk_line *prev_line = line;
 
+		/* If we cannot allocate a new line, make sure to store metadata
+		 * on the current line and then fail
+		 */
 		line = pblk_line_replace_data(pblk);
 		pblk_line_close_meta(pblk, prev_line);
+
+		if (!line)
+			return -EINTR;
 	}
 
 	emeta = line->emeta;
@@ -74,6 +80,7 @@ static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
 	}
 
 	pblk_down_rq(pblk, ppa_list, nr_secs, lun_bitmap);
+	return 0;
 }
 
 void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
@@ -87,8 +94,12 @@ void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
 
 	for (i = off; i < rqd->nr_ppas; i += min) {
 		map_secs = (i + min > valid_secs) ? (valid_secs % min) : min;
-		pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i],
-					lun_bitmap, &meta_list[i], map_secs);
+		if (pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i],
+					lun_bitmap, &meta_list[i], map_secs)) {
+			bio_put(rqd->bio);
+			pblk_free_rqd(pblk, rqd, PBLK_WRITE);
+			pblk_pipeline_stop(pblk);
+		}
 	}
 }
 
@@ -108,8 +119,12 @@ void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
 
 	for (i = 0; i < rqd->nr_ppas; i += min) {
 		map_secs = (i + min > valid_secs) ? (valid_secs % min) : min;
-		pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i],
-					lun_bitmap, &meta_list[i], map_secs);
+		if (pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i],
+					lun_bitmap, &meta_list[i], map_secs)) {
+			bio_put(rqd->bio);
+			pblk_free_rqd(pblk, rqd, PBLK_WRITE);
+			pblk_pipeline_stop(pblk);
+		}
 
 		erase_lun = pblk_ppa_to_pos(geo, rqd->ppa_list[i]);
 

+ 4 - 44
drivers/lightnvm/pblk-rb.c

@@ -142,10 +142,9 @@ static void clean_wctx(struct pblk_w_ctx *w_ctx)
 {
 	int flags;
 
-try:
 	flags = READ_ONCE(w_ctx->flags);
-	if (!(flags & PBLK_SUBMITTED_ENTRY))
-		goto try;
+	WARN_ONCE(!(flags & PBLK_SUBMITTED_ENTRY),
+			"pblk: overwriting unsubmitted data\n");
 
 	/* Release flags on context. Protect from writes and reads */
 	smp_store_release(&w_ctx->flags, PBLK_WRITABLE_ENTRY);
@@ -350,7 +349,7 @@ void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data,
 }
 
 static int pblk_rb_flush_point_set(struct pblk_rb *rb, struct bio *bio,
-				  unsigned int pos)
+				   unsigned int pos)
 {
 	struct pblk_rb_entry *entry;
 	unsigned int sync, flush_point;
@@ -420,7 +419,7 @@ void pblk_rb_flush(struct pblk_rb *rb)
 	if (pblk_rb_flush_point_set(rb, NULL, mem))
 		return;
 
-	pblk_write_should_kick(pblk);
+	pblk_write_kick(pblk);
 }
 
 static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries,
@@ -503,45 +502,6 @@ int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries,
 	return 1;
 }
 
-/*
- * The caller of this function must ensure that the backpointer will not
- * overwrite the entries passed on the list.
- */
-unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio,
-				      struct list_head *list,
-				      unsigned int max)
-{
-	struct pblk_rb_entry *entry, *tentry;
-	struct page *page;
-	unsigned int read = 0;
-	int ret;
-
-	list_for_each_entry_safe(entry, tentry, list, index) {
-		if (read > max) {
-			pr_err("pblk: too many entries on list\n");
-			goto out;
-		}
-
-		page = virt_to_page(entry->data);
-		if (!page) {
-			pr_err("pblk: could not allocate write bio page\n");
-			goto out;
-		}
-
-		ret = bio_add_page(bio, page, rb->seg_size, 0);
-		if (ret != rb->seg_size) {
-			pr_err("pblk: could not add page to write bio\n");
-			goto out;
-		}
-
-		list_del(&entry->index);
-		read++;
-	}
-
-out:
-	return read;
-}
-
 /*
  * Read available entries on rb and add them to the given bio. To avoid a memory
  * copy, a page reference to the write buffer is added to the bio instead.

+ 88 - 58
drivers/lightnvm/pblk-read.c

@@ -39,10 +39,10 @@ static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio,
 }
 
 static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
-				 sector_t blba, unsigned long *read_bitmap)
+				 struct bio *bio, sector_t blba,
+				 unsigned long *read_bitmap)
 {
 	struct pblk_sec_meta *meta_list = rqd->meta_list;
-	struct bio *bio = rqd->bio;
 	struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS];
 	int nr_secs = rqd->nr_ppas;
 	bool advanced_bio = false;
@@ -102,32 +102,69 @@ next:
 #endif
 }
 
-static int pblk_submit_read_io(struct pblk *pblk, struct nvm_rq *rqd)
+static void pblk_read_check_seq(struct pblk *pblk, struct nvm_rq *rqd,
+				sector_t blba)
 {
-	int err;
+	struct pblk_sec_meta *meta_lba_list = rqd->meta_list;
+	int nr_lbas = rqd->nr_ppas;
+	int i;
 
-	err = pblk_submit_io(pblk, rqd);
-	if (err)
-		return NVM_IO_ERR;
+	for (i = 0; i < nr_lbas; i++) {
+		u64 lba = le64_to_cpu(meta_lba_list[i].lba);
+
+		if (lba == ADDR_EMPTY)
+			continue;
+
+		if (lba != blba + i) {
+#ifdef CONFIG_NVM_DEBUG
+			struct ppa_addr *p;
 
-	return NVM_IO_OK;
+			p = (nr_lbas == 1) ? &rqd->ppa_addr : &rqd->ppa_list[i];
+			print_ppa(&pblk->dev->geo, p, "seq", i);
+#endif
+			pr_err("pblk: corrupted read LBA (%llu/%llu)\n",
+							lba, (u64)blba + i);
+			WARN_ON(1);
+		}
+	}
 }
 
-static void pblk_read_check(struct pblk *pblk, struct nvm_rq *rqd,
-			   sector_t blba)
+/*
+ * There can be holes in the lba list.
+ */
+static void pblk_read_check_rand(struct pblk *pblk, struct nvm_rq *rqd,
+				 u64 *lba_list, int nr_lbas)
 {
-	struct pblk_sec_meta *meta_list = rqd->meta_list;
-	int nr_lbas = rqd->nr_ppas;
-	int i;
+	struct pblk_sec_meta *meta_lba_list = rqd->meta_list;
+	int i, j;
 
-	for (i = 0; i < nr_lbas; i++) {
-		u64 lba = le64_to_cpu(meta_list[i].lba);
+	for (i = 0, j = 0; i < nr_lbas; i++) {
+		u64 lba = lba_list[i];
+		u64 meta_lba;
 
 		if (lba == ADDR_EMPTY)
 			continue;
 
-		WARN(lba != blba + i, "pblk: corrupted read LBA\n");
+		meta_lba = le64_to_cpu(meta_lba_list[j].lba);
+
+		if (lba != meta_lba) {
+#ifdef CONFIG_NVM_DEBUG
+			struct ppa_addr *p;
+			int nr_ppas = rqd->nr_ppas;
+
+			p = (nr_ppas == 1) ? &rqd->ppa_addr : &rqd->ppa_list[j];
+			print_ppa(&pblk->dev->geo, p, "seq", j);
+#endif
+			pr_err("pblk: corrupted read LBA (%llu/%llu)\n",
+								lba, meta_lba);
+			WARN_ON(1);
+		}
+
+		j++;
 	}
+
+	WARN_ONCE(j != rqd->nr_ppas, "pblk: corrupted random request\n");
 }
 
 static void pblk_read_put_rqd_kref(struct pblk *pblk, struct nvm_rq *rqd)
@@ -152,7 +189,6 @@ static void pblk_end_user_read(struct bio *bio)
 	WARN_ONCE(bio->bi_status, "pblk: corrupted read bio\n");
 #endif
 	bio_endio(bio);
-	bio_put(bio);
 }
 
 static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd,
@@ -160,23 +196,18 @@ static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd,
 {
 	struct nvm_tgt_dev *dev = pblk->dev;
 	struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
-	struct bio *bio = rqd->bio;
+	struct bio *int_bio = rqd->bio;
 	unsigned long start_time = r_ctx->start_time;
 
 	generic_end_io_acct(dev->q, READ, &pblk->disk->part0, start_time);
 
 	if (rqd->error)
 		pblk_log_read_err(pblk, rqd);
-#ifdef CONFIG_NVM_DEBUG
-	else
-		WARN_ONCE(bio->bi_status, "pblk: corrupted read error\n");
-#endif
 
-	pblk_read_check(pblk, rqd, r_ctx->lba);
+	pblk_read_check_seq(pblk, rqd, r_ctx->lba);
 
-	bio_put(bio);
-	if (r_ctx->private)
-		pblk_end_user_read((struct bio *)r_ctx->private);
+	if (int_bio)
+		bio_put(int_bio);
 
 	if (put_line)
 		pblk_read_put_rqd_kref(pblk, rqd);
@@ -193,16 +224,19 @@ static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd,
 static void pblk_end_io_read(struct nvm_rq *rqd)
 {
 	struct pblk *pblk = rqd->private;
+	struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
+	struct bio *bio = (struct bio *)r_ctx->private;
 
+	pblk_end_user_read(bio);
 	__pblk_end_io_read(pblk, rqd, true);
 }
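
From here on the user bio never enters the nvm_rq: rqd->bio only ever holds the internal clone created with bio_clone_fast() from the embedded pblk_bio_set (see the hunks below), while the original bio travels in r_ctx->private and is completed exactly once in the completion path above. A rough sketch of that ownership split, assuming a driver-private context struct; all names here are illustrative, not pblk code:

/* Sketch: keep the user bio out of the device request, complete it once. */
#include <linux/bio.h>
#include <linux/gfp.h>

struct example_ctx {
	struct bio *orig;	/* user bio, ended once at completion */
	struct bio *clone;	/* internal bio the driver actually issues */
};

static int example_issue_read(struct bio *bio, struct bio_set *bs,
			      struct example_ctx *ctx)
{
	struct bio *clone = bio_clone_fast(bio, GFP_KERNEL, bs);

	if (!clone)
		return -ENOMEM;

	ctx->orig = bio;	/* original stays out of the device path */
	ctx->clone = clone;
	/* ... submit 'clone' to the device here ... */
	return 0;
}

static void example_end_read(struct example_ctx *ctx, blk_status_t status)
{
	ctx->orig->bi_status = status;
	bio_endio(ctx->orig);	/* complete the user bio exactly once */
	bio_put(ctx->clone);	/* drop the internal clone */
}
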
 
-static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
-				 unsigned int bio_init_idx,
-				 unsigned long *read_bitmap)
+static int pblk_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
+			     struct bio *orig_bio, unsigned int bio_init_idx,
+			     unsigned long *read_bitmap)
 {
-	struct bio *new_bio, *bio = rqd->bio;
 	struct pblk_sec_meta *meta_list = rqd->meta_list;
+	struct bio *new_bio;
 	struct bio_vec src_bv, dst_bv;
 	void *ppa_ptr = NULL;
 	void *src_p, *dst_p;
@@ -219,11 +253,11 @@ static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
 	new_bio = bio_alloc(GFP_KERNEL, nr_holes);
 
 	if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes))
-		goto err;
+		goto fail_add_pages;
 
 	if (nr_holes != new_bio->bi_vcnt) {
 		pr_err("pblk: malformed bio\n");
-		goto err;
+		goto fail;
 	}
 
 	for (i = 0; i < nr_secs; i++)
@@ -246,7 +280,7 @@ static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
 	if (ret) {
 		bio_put(rqd->bio);
 		pr_err("pblk: sync read IO submission failed\n");
-		goto err;
+		goto fail;
 	}
 
 	if (rqd->error) {
@@ -282,7 +316,7 @@ static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
 		meta_list[hole].lba = lba_list_media[i];
 
 		src_bv = new_bio->bi_io_vec[i++];
-		dst_bv = bio->bi_io_vec[bio_init_idx + hole];
+		dst_bv = orig_bio->bi_io_vec[bio_init_idx + hole];
 
 		src_p = kmap_atomic(src_bv.bv_page);
 		dst_p = kmap_atomic(dst_bv.bv_page);
@@ -294,35 +328,33 @@ static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
 		kunmap_atomic(src_p);
 		kunmap_atomic(dst_p);
 
-		mempool_free(src_bv.bv_page, pblk->page_bio_pool);
+		mempool_free(src_bv.bv_page, &pblk->page_bio_pool);
 
 		hole = find_next_zero_bit(read_bitmap, nr_secs, hole + 1);
 	} while (hole < nr_secs);
 
 	bio_put(new_bio);
 
-	/* Complete the original bio and associated request */
-	bio_endio(bio);
-	rqd->bio = bio;
+	/* restore original request */
+	rqd->bio = NULL;
 	rqd->nr_ppas = nr_secs;
 
 	__pblk_end_io_read(pblk, rqd, false);
-	return NVM_IO_OK;
-
-err:
-	pr_err("pblk: failed to perform partial read\n");
+	return NVM_IO_DONE;
 
+fail:
 	/* Free allocated pages in new bio */
-	pblk_bio_free_pages(pblk, bio, 0, new_bio->bi_vcnt);
+	pblk_bio_free_pages(pblk, new_bio, 0, new_bio->bi_vcnt);
+fail_add_pages:
+	pr_err("pblk: failed to perform partial read\n");
 	__pblk_end_io_read(pblk, rqd, false);
 	return NVM_IO_ERR;
 }
 
-static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd,
+static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio,
 			 sector_t lba, unsigned long *read_bitmap)
 {
 	struct pblk_sec_meta *meta_list = rqd->meta_list;
-	struct bio *bio = rqd->bio;
 	struct ppa_addr ppa;
 
 	pblk_lookup_l2p_seq(pblk, &ppa, lba, 1);
@@ -386,14 +418,15 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
 	rqd = pblk_alloc_rqd(pblk, PBLK_READ);
 
 	rqd->opcode = NVM_OP_PREAD;
-	rqd->bio = bio;
 	rqd->nr_ppas = nr_secs;
+	rqd->bio = NULL; /* cloned bio if needed */
 	rqd->private = pblk;
 	rqd->end_io = pblk_end_io_read;
 
 	r_ctx = nvm_rq_to_pdu(rqd);
 	r_ctx->start_time = jiffies;
 	r_ctx->lba = blba;
+	r_ctx->private = bio; /* original bio */
 
 	/* Save the index for this bio's start. This is needed in case
 	 * we need to fill a partial read.
@@ -411,17 +444,15 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
 		rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size;
 		rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size;
 
-		pblk_read_ppalist_rq(pblk, rqd, blba, &read_bitmap);
+		pblk_read_ppalist_rq(pblk, rqd, bio, blba, &read_bitmap);
 	} else {
-		pblk_read_rq(pblk, rqd, blba, &read_bitmap);
+		pblk_read_rq(pblk, rqd, bio, blba, &read_bitmap);
 	}
 
-	bio_get(bio);
 	if (bitmap_full(&read_bitmap, nr_secs)) {
-		bio_endio(bio);
 		atomic_inc(&pblk->inflight_io);
 		__pblk_end_io_read(pblk, rqd, false);
-		return NVM_IO_OK;
+		return NVM_IO_DONE;
 	}
 
 	/* All sectors are to be read from the device */
@@ -429,20 +460,17 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
 		struct bio *int_bio = NULL;
 
 		/* Clone read bio to deal with read errors internally */
-		int_bio = bio_clone_fast(bio, GFP_KERNEL, pblk_bio_set);
+		int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set);
 		if (!int_bio) {
 			pr_err("pblk: could not clone read bio\n");
 			goto fail_end_io;
 		}
 
 		rqd->bio = int_bio;
-		r_ctx->private = bio;
 
-		ret = pblk_submit_read_io(pblk, rqd);
-		if (ret) {
+		if (pblk_submit_io(pblk, rqd)) {
 			pr_err("pblk: read IO submission failed\n");
-			if (int_bio)
-				bio_put(int_bio);
+			ret = NVM_IO_ERR;
 			goto fail_end_io;
 		}
 
@@ -452,7 +480,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
 	/* The read bio request could be partially filled by the write buffer,
 	 * but there are some holes that need to be read from the drive.
 	 */
-	return pblk_partial_read_bio(pblk, rqd, bio_init_idx, &read_bitmap);
+	return pblk_partial_read(pblk, rqd, bio, bio_init_idx, &read_bitmap);
 
 fail_rqd_free:
 	pblk_free_rqd(pblk, rqd, PBLK_READ);
@@ -585,6 +613,8 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq)
 		goto err_free_bio;
 	}
 
+	pblk_read_check_rand(pblk, &rqd, gc_rq->lba_list, gc_rq->nr_secs);
+
 	atomic_dec(&pblk->inflight_io);
 
 	if (rqd.error) {

+ 21 - 100
drivers/lightnvm/pblk-recovery.c

@@ -16,97 +16,6 @@
 
 #include "pblk.h"
 
-void pblk_submit_rec(struct work_struct *work)
-{
-	struct pblk_rec_ctx *recovery =
-			container_of(work, struct pblk_rec_ctx, ws_rec);
-	struct pblk *pblk = recovery->pblk;
-	struct nvm_rq *rqd = recovery->rqd;
-	struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
-	struct bio *bio;
-	unsigned int nr_rec_secs;
-	unsigned int pgs_read;
-	int ret;
-
-	nr_rec_secs = bitmap_weight((unsigned long int *)&rqd->ppa_status,
-								NVM_MAX_VLBA);
-
-	bio = bio_alloc(GFP_KERNEL, nr_rec_secs);
-
-	bio->bi_iter.bi_sector = 0;
-	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
-	rqd->bio = bio;
-	rqd->nr_ppas = nr_rec_secs;
-
-	pgs_read = pblk_rb_read_to_bio_list(&pblk->rwb, bio, &recovery->failed,
-								nr_rec_secs);
-	if (pgs_read != nr_rec_secs) {
-		pr_err("pblk: could not read recovery entries\n");
-		goto err;
-	}
-
-	if (pblk_setup_w_rec_rq(pblk, rqd, c_ctx)) {
-		pr_err("pblk: could not setup recovery request\n");
-		goto err;
-	}
-
-#ifdef CONFIG_NVM_DEBUG
-	atomic_long_add(nr_rec_secs, &pblk->recov_writes);
-#endif
-
-	ret = pblk_submit_io(pblk, rqd);
-	if (ret) {
-		pr_err("pblk: I/O submission failed: %d\n", ret);
-		goto err;
-	}
-
-	mempool_free(recovery, pblk->rec_pool);
-	return;
-
-err:
-	bio_put(bio);
-	pblk_free_rqd(pblk, rqd, PBLK_WRITE);
-}
-
-int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
-			struct pblk_rec_ctx *recovery, u64 *comp_bits,
-			unsigned int comp)
-{
-	struct nvm_rq *rec_rqd;
-	struct pblk_c_ctx *rec_ctx;
-	int nr_entries = c_ctx->nr_valid + c_ctx->nr_padded;
-
-	rec_rqd = pblk_alloc_rqd(pblk, PBLK_WRITE);
-	rec_ctx = nvm_rq_to_pdu(rec_rqd);
-
-	/* Copy completion bitmap, but exclude the first X completed entries */
-	bitmap_shift_right((unsigned long int *)&rec_rqd->ppa_status,
-				(unsigned long int *)comp_bits,
-				comp, NVM_MAX_VLBA);
-
-	/* Save the context for the entries that need to be re-written and
-	 * update current context with the completed entries.
-	 */
-	rec_ctx->sentry = pblk_rb_wrap_pos(&pblk->rwb, c_ctx->sentry + comp);
-	if (comp >= c_ctx->nr_valid) {
-		rec_ctx->nr_valid = 0;
-		rec_ctx->nr_padded = nr_entries - comp;
-
-		c_ctx->nr_padded = comp - c_ctx->nr_valid;
-	} else {
-		rec_ctx->nr_valid = c_ctx->nr_valid - comp;
-		rec_ctx->nr_padded = c_ctx->nr_padded;
-
-		c_ctx->nr_valid = comp;
-		c_ctx->nr_padded = 0;
-	}
-
-	recovery->rqd = rec_rqd;
-	recovery->pblk = pblk;
-
-	return 0;
-}
-
 int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta_buf)
 {
 	u32 crc;
@@ -865,18 +774,30 @@ static void pblk_recov_wa_counters(struct pblk *pblk,
 }
 
 static int pblk_line_was_written(struct pblk_line *line,
-			    struct pblk_line_meta *lm)
+			    struct pblk *pblk)
 {
 
-	int i;
-	int state_mask = NVM_CHK_ST_OFFLINE | NVM_CHK_ST_FREE;
+	struct pblk_line_meta *lm = &pblk->lm;
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct nvm_chk_meta *chunk;
+	struct ppa_addr bppa;
+	int smeta_blk;
 
-	for (i = 0; i < lm->blk_per_line; i++) {
-		if (!(line->chks[i].state & state_mask))
-			return 1;
-	}
+	if (line->state == PBLK_LINESTATE_BAD)
+		return 0;
 
-	return 0;
+	smeta_blk = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line);
+	if (smeta_blk >= lm->blk_per_line)
+		return 0;
+
+	bppa = pblk->luns[smeta_blk].bppa;
+	chunk = &line->chks[pblk_ppa_to_pos(geo, bppa)];
+
+	if (chunk->state & NVM_CHK_ST_FREE)
+		return 0;
+
+	return 1;
 }
 
 struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
@@ -915,7 +836,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
 		line->lun_bitmap = ((void *)(smeta_buf)) +
 						sizeof(struct line_smeta);
 
-		if (!pblk_line_was_written(line, lm))
+		if (!pblk_line_was_written(line, pblk))
 			continue;
 
 		/* Lines that cannot be read are assumed as not written here */

+ 25 - 4
drivers/lightnvm/pblk-rl.c

@@ -73,6 +73,16 @@ void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries)
 	pblk_rl_kick_u_timer(rl);
 }
 
+void pblk_rl_werr_line_in(struct pblk_rl *rl)
+{
+	atomic_inc(&rl->werr_lines);
+}
+
+void pblk_rl_werr_line_out(struct pblk_rl *rl)
+{
+	atomic_dec(&rl->werr_lines);
+}
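
The two counters above feed __pblk_rl_update_rates() in the next hunk: even when free blocks are plentiful, a small slice of the write-buffer budget is handed to GC as long as write-error lines are pending. A standalone restatement of that split; the struct below is a simplification, not the real pblk_rl layout:

struct rl_budget {
	unsigned int rb_budget;		/* total write-buffer budget */
	unsigned int rb_windows_pw;	/* log2 of one GC window */
	unsigned int rb_user_max;
	unsigned int rb_gc_max;
};

static void rl_update_high(struct rl_budget *rl, unsigned int werr_lines)
{
	if (werr_lines) {
		/* Reserve one window for rewriting write-error lines. */
		rl->rb_gc_max = 1U << rl->rb_windows_pw;
		rl->rb_user_max = rl->rb_budget - rl->rb_gc_max;
	} else {
		rl->rb_user_max = rl->rb_budget;
		rl->rb_gc_max = 0;
	}
}
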
+
 void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries)
 {
 	atomic_add(nr_entries, &rl->rb_gc_cnt);
@@ -99,11 +109,21 @@ static void __pblk_rl_update_rates(struct pblk_rl *rl,
 {
 	struct pblk *pblk = container_of(rl, struct pblk, rl);
 	int max = rl->rb_budget;
+	int werr_gc_needed = atomic_read(&rl->werr_lines);
 
 	if (free_blocks >= rl->high) {
-		rl->rb_user_max = max;
-		rl->rb_gc_max = 0;
-		rl->rb_state = PBLK_RL_HIGH;
+		if (werr_gc_needed) {
+			/* Allocate a small budget for recovering
+			 * lines with write errors
+			 */
+			rl->rb_gc_max = 1 << rl->rb_windows_pw;
+			rl->rb_user_max = max - rl->rb_gc_max;
+			rl->rb_state = PBLK_RL_WERR;
+		} else {
+			rl->rb_user_max = max;
+			rl->rb_gc_max = 0;
+			rl->rb_state = PBLK_RL_OFF;
+		}
 	} else if (free_blocks < rl->high) {
 		int shift = rl->high_pw - rl->rb_windows_pw;
 		int user_windows = free_blocks >> shift;
@@ -124,7 +144,7 @@ static void __pblk_rl_update_rates(struct pblk_rl *rl,
 		rl->rb_state = PBLK_RL_LOW;
 	}
 
-	if (rl->rb_state == (PBLK_RL_MID | PBLK_RL_LOW))
+	if (rl->rb_state != PBLK_RL_OFF)
 		pblk_gc_should_start(pblk);
 	else
 		pblk_gc_should_stop(pblk);
@@ -221,6 +241,7 @@ void pblk_rl_init(struct pblk_rl *rl, int budget)
 	atomic_set(&rl->rb_user_cnt, 0);
 	atomic_set(&rl->rb_gc_cnt, 0);
 	atomic_set(&rl->rb_space, -1);
+	atomic_set(&rl->werr_lines, 0);
 
 	timer_setup(&rl->u_timer, pblk_rl_u_timer, 0);
 

+ 13 - 2
drivers/lightnvm/pblk-sysfs.c

@@ -173,6 +173,8 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page)
 	int free_line_cnt = 0, closed_line_cnt = 0, emeta_line_cnt = 0;
 	int d_line_cnt = 0, l_line_cnt = 0;
 	int gc_full = 0, gc_high = 0, gc_mid = 0, gc_low = 0, gc_empty = 0;
+	int gc_werr = 0;
+
 	int bad = 0, cor = 0;
 	int msecs = 0, cur_sec = 0, vsc = 0, sec_in_line = 0;
 	int map_weight = 0, meta_weight = 0;
@@ -237,6 +239,15 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page)
 		gc_empty++;
 	}
 
+	list_for_each_entry(line, &l_mg->gc_werr_list, list) {
+		if (line->type == PBLK_LINETYPE_DATA)
+			d_line_cnt++;
+		else if (line->type == PBLK_LINETYPE_LOG)
+			l_line_cnt++;
+		closed_line_cnt++;
+		gc_werr++;
+	}
+
 	list_for_each_entry(line, &l_mg->bad_list, list)
 		bad++;
 	list_for_each_entry(line, &l_mg->corrupt_list, list)
@@ -275,8 +286,8 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page)
 					l_mg->nr_lines);
 
 	sz += snprintf(page + sz, PAGE_SIZE - sz,
-		"GC: full:%d, high:%d, mid:%d, low:%d, empty:%d, queue:%d\n",
-			gc_full, gc_high, gc_mid, gc_low, gc_empty,
+		"GC: full:%d, high:%d, mid:%d, low:%d, empty:%d, werr:%d, queue:%d\n",
+			gc_full, gc_high, gc_mid, gc_low, gc_empty, gc_werr,
 			atomic_read(&pblk->gc.read_inflight_gc));
 
 	sz += snprintf(page + sz, PAGE_SIZE - sz,

+ 178 - 91
drivers/lightnvm/pblk-write.c

@@ -103,68 +103,150 @@ retry:
 	pblk_rb_sync_end(&pblk->rwb, &flags);
 }
 
-/* When a write fails, we are not sure whether the block has grown bad or a page
- * range is more susceptible to write errors. If a high number of pages fail, we
- * assume that the block is bad and we mark it accordingly. In all cases, we
- * remap and resubmit the failed entries as fast as possible; if a flush is
- * waiting on a completion, the whole stack would stall otherwise.
- */
-static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd)
+/* Map remaining sectors in chunk, starting from ppa */
+static void pblk_map_remaining(struct pblk *pblk, struct ppa_addr *ppa)
 {
-	void *comp_bits = &rqd->ppa_status;
-	struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
-	struct pblk_rec_ctx *recovery;
-	struct ppa_addr *ppa_list = rqd->ppa_list;
-	int nr_ppas = rqd->nr_ppas;
-	unsigned int c_entries;
-	int bit, ret;
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct pblk_line *line;
+	struct ppa_addr map_ppa = *ppa;
+	u64 paddr;
+	int done = 0;
 
-	if (unlikely(nr_ppas == 1))
-		ppa_list = &rqd->ppa_addr;
+	line = &pblk->lines[pblk_ppa_to_line(*ppa)];
+	spin_lock(&line->lock);
 
-	recovery = mempool_alloc(pblk->rec_pool, GFP_ATOMIC);
+	while (!done)  {
+		paddr = pblk_dev_ppa_to_line_addr(pblk, map_ppa);
 
-	INIT_LIST_HEAD(&recovery->failed);
+		if (!test_and_set_bit(paddr, line->map_bitmap))
+			line->left_msecs--;
 
-	bit = -1;
-	while ((bit = find_next_bit(comp_bits, nr_ppas, bit + 1)) < nr_ppas) {
-		struct pblk_rb_entry *entry;
-		struct ppa_addr ppa;
+		if (!test_and_set_bit(paddr, line->invalid_bitmap))
+			le32_add_cpu(line->vsc, -1);
 
-		/* Logic error */
-		if (bit > c_ctx->nr_valid) {
-			WARN_ONCE(1, "pblk: corrupted write request\n");
-			mempool_free(recovery, pblk->rec_pool);
-			goto out;
+		if (geo->version == NVM_OCSSD_SPEC_12) {
+			map_ppa.ppa++;
+			if (map_ppa.g.pg == geo->num_pg)
+				done = 1;
+		} else {
+			map_ppa.m.sec++;
+			if (map_ppa.m.sec == geo->clba)
+				done = 1;
 		}
+	}
 
-		ppa = ppa_list[bit];
-		entry = pblk_rb_sync_scan_entry(&pblk->rwb, &ppa);
-		if (!entry) {
-			pr_err("pblk: could not scan entry on write failure\n");
-			mempool_free(recovery, pblk->rec_pool);
-			goto out;
-		}
+	line->w_err_gc->has_write_err = 1;
+	spin_unlock(&line->lock);
+}
 
-		/* The list is filled first and emptied afterwards. No need for
-		 * protecting it with a lock
+static void pblk_prepare_resubmit(struct pblk *pblk, unsigned int sentry,
+				  unsigned int nr_entries)
+{
+	struct pblk_rb *rb = &pblk->rwb;
+	struct pblk_rb_entry *entry;
+	struct pblk_line *line;
+	struct pblk_w_ctx *w_ctx;
+	struct ppa_addr ppa_l2p;
+	int flags;
+	unsigned int pos, i;
+
+	spin_lock(&pblk->trans_lock);
+	pos = sentry;
+	for (i = 0; i < nr_entries; i++) {
+		entry = &rb->entries[pos];
+		w_ctx = &entry->w_ctx;
+
+		/* Check if the lba has been overwritten */
+		ppa_l2p = pblk_trans_map_get(pblk, w_ctx->lba);
+		if (!pblk_ppa_comp(ppa_l2p, entry->cacheline))
+			w_ctx->lba = ADDR_EMPTY;
+
+		/* Mark up the entry as submittable again */
+		flags = READ_ONCE(w_ctx->flags);
+		flags |= PBLK_WRITTEN_DATA;
+		/* Release flags on write context. Protect from writes */
+		smp_store_release(&w_ctx->flags, flags);
+
+		/* Decrease the reference count to the line as we will
+		 * re-map these entries
 		 */
-		list_add_tail(&entry->index, &recovery->failed);
+		line = &pblk->lines[pblk_ppa_to_line(w_ctx->ppa)];
+		kref_put(&line->ref, pblk_line_put);
+
+		pos = (pos + 1) & (rb->nr_entries - 1);
 	}
+	spin_unlock(&pblk->trans_lock);
+}
 
-	c_entries = find_first_bit(comp_bits, nr_ppas);
-	ret = pblk_recov_setup_rq(pblk, c_ctx, recovery, comp_bits, c_entries);
-	if (ret) {
-		pr_err("pblk: could not recover from write failure\n");
-		mempool_free(recovery, pblk->rec_pool);
-		goto out;
+static void pblk_queue_resubmit(struct pblk *pblk, struct pblk_c_ctx *c_ctx)
+{
+	struct pblk_c_ctx *r_ctx;
+
+	r_ctx = kzalloc(sizeof(struct pblk_c_ctx), GFP_KERNEL);
+	if (!r_ctx)
+		return;
+
+	r_ctx->lun_bitmap = NULL;
+	r_ctx->sentry = c_ctx->sentry;
+	r_ctx->nr_valid = c_ctx->nr_valid;
+	r_ctx->nr_padded = c_ctx->nr_padded;
+
+	spin_lock(&pblk->resubmit_lock);
+	list_add_tail(&r_ctx->list, &pblk->resubmit_list);
+	spin_unlock(&pblk->resubmit_lock);
+
+#ifdef CONFIG_NVM_DEBUG
+	atomic_long_add(c_ctx->nr_valid, &pblk->recov_writes);
+#endif
+}
+
+static void pblk_submit_rec(struct work_struct *work)
+{
+	struct pblk_rec_ctx *recovery =
+			container_of(work, struct pblk_rec_ctx, ws_rec);
+	struct pblk *pblk = recovery->pblk;
+	struct nvm_rq *rqd = recovery->rqd;
+	struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
+	struct ppa_addr *ppa_list;
+
+	pblk_log_write_err(pblk, rqd);
+
+	if (rqd->nr_ppas == 1)
+		ppa_list = &rqd->ppa_addr;
+	else
+		ppa_list = rqd->ppa_list;
+
+	pblk_map_remaining(pblk, ppa_list);
+	pblk_queue_resubmit(pblk, c_ctx);
+
+	pblk_up_rq(pblk, rqd->ppa_list, rqd->nr_ppas, c_ctx->lun_bitmap);
+	if (c_ctx->nr_padded)
+		pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid,
+							c_ctx->nr_padded);
+	bio_put(rqd->bio);
+	pblk_free_rqd(pblk, rqd, PBLK_WRITE);
+	mempool_free(recovery, &pblk->rec_pool);
+
+	atomic_dec(&pblk->inflight_io);
+}
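
pblk_submit_rec() above replaces the old in-place recovery write: a failed request now maps the rest of its chunk and marks those sectors invalid, parks the failed buffer range on pblk->resubmit_list, and leaves the actual rewrite to the normal writer thread, which drains that list before consuming new data (see the pblk_submit_write() hunk below). A userspace sketch of that queue-and-drain shape, with a pthread mutex standing in for the spinlock and every name invented:

/* Sketch: record failed ranges on a FIFO list; the writer drains it first. */
#include <pthread.h>
#include <stdlib.h>

struct resub_ctx {
	unsigned int sentry;		/* first failed buffer entry */
	unsigned int nr_valid;		/* number of entries to rewrite */
	struct resub_ctx *next;
};

static struct resub_ctx *resub_head, *resub_tail;
static pthread_mutex_t resub_lock = PTHREAD_MUTEX_INITIALIZER;

/* Completion path: remember the failed range, never rewrite from here. */
static void queue_resubmit(unsigned int sentry, unsigned int nr_valid)
{
	struct resub_ctx *r = malloc(sizeof(*r));

	if (!r)
		return;
	r->sentry = sentry;
	r->nr_valid = nr_valid;
	r->next = NULL;

	pthread_mutex_lock(&resub_lock);
	if (resub_tail)
		resub_tail->next = r;
	else
		resub_head = r;
	resub_tail = r;
	pthread_mutex_unlock(&resub_lock);
}

/* Writer thread: failed ranges take priority over new buffer data. */
static struct resub_ctx *dequeue_resubmit(void)
{
	struct resub_ctx *r;

	pthread_mutex_lock(&resub_lock);
	r = resub_head;
	if (r) {
		resub_head = r->next;
		if (!resub_head)
			resub_tail = NULL;
	}
	pthread_mutex_unlock(&resub_lock);
	return r;
}
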
+
+static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd)
+{
+	struct pblk_rec_ctx *recovery;
+
+	recovery = mempool_alloc(&pblk->rec_pool, GFP_ATOMIC);
+	if (!recovery) {
+		pr_err("pblk: could not allocate recovery work\n");
+		return;
 	}
 
+	recovery->pblk = pblk;
+	recovery->rqd = rqd;
+
 	INIT_WORK(&recovery->ws_rec, pblk_submit_rec);
 	queue_work(pblk->close_wq, &recovery->ws_rec);
-
-out:
-	pblk_complete_write(pblk, rqd, c_ctx);
 }
 
 static void pblk_end_io_write(struct nvm_rq *rqd)
@@ -173,8 +255,8 @@ static void pblk_end_io_write(struct nvm_rq *rqd)
 	struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
 
 	if (rqd->error) {
-		pblk_log_write_err(pblk, rqd);
-		return pblk_end_w_fail(pblk, rqd);
+		pblk_end_w_fail(pblk, rqd);
+		return;
 	}
 #ifdef CONFIG_NVM_DEBUG
 	else
@@ -198,6 +280,7 @@ static void pblk_end_io_write_meta(struct nvm_rq *rqd)
 	if (rqd->error) {
 		pblk_log_write_err(pblk, rqd);
 		pr_err("pblk: metadata I/O failed. Line %d\n", line->id);
+		line->w_err_gc->has_write_err = 1;
 	}
 
 	sync = atomic_add_return(rqd->nr_ppas, &emeta->sync);
@@ -266,31 +349,6 @@ static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
 	return 0;
 }
 
-int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd,
-			struct pblk_c_ctx *c_ctx)
-{
-	struct pblk_line_meta *lm = &pblk->lm;
-	unsigned long *lun_bitmap;
-	int ret;
-
-	lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL);
-	if (!lun_bitmap)
-		return -ENOMEM;
-
-	c_ctx->lun_bitmap = lun_bitmap;
-
-	ret = pblk_alloc_w_rq(pblk, rqd, rqd->nr_ppas, pblk_end_io_write);
-	if (ret)
-		return ret;
-
-	pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, c_ctx->nr_valid, 0);
-
-	rqd->ppa_status = (u64)0;
-	rqd->flags = pblk_set_progr_mode(pblk, PBLK_WRITE);
-
-	return ret;
-}
-
 static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail,
 				  unsigned int secs_to_flush)
 {
@@ -339,6 +397,7 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
 	bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len,
 					l_mg->emeta_alloc_type, GFP_KERNEL);
 	if (IS_ERR(bio)) {
+		pr_err("pblk: failed to map emeta io\n");
 		ret = PTR_ERR(bio);
 		goto fail_free_rqd;
 	}
@@ -515,26 +574,54 @@ static int pblk_submit_write(struct pblk *pblk)
 	unsigned int secs_avail, secs_to_sync, secs_to_com;
 	unsigned int secs_to_flush;
 	unsigned long pos;
+	unsigned int resubmit;
 
-	/* If there are no sectors in the cache, flushes (bios without data)
-	 * will be cleared on the cache threads
-	 */
-	secs_avail = pblk_rb_read_count(&pblk->rwb);
-	if (!secs_avail)
-		return 1;
-
-	secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb);
-	if (!secs_to_flush && secs_avail < pblk->min_write_pgs)
-		return 1;
-
-	secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush);
-	if (secs_to_sync > pblk->max_write_pgs) {
-		pr_err("pblk: bad buffer sync calculation\n");
-		return 1;
-	}
+	spin_lock(&pblk->resubmit_lock);
+	resubmit = !list_empty(&pblk->resubmit_list);
+	spin_unlock(&pblk->resubmit_lock);
+
+	/* Resubmit failed writes first */
+	if (resubmit) {
+		struct pblk_c_ctx *r_ctx;
+
+		spin_lock(&pblk->resubmit_lock);
+		r_ctx = list_first_entry(&pblk->resubmit_list,
+					struct pblk_c_ctx, list);
+		list_del(&r_ctx->list);
+		spin_unlock(&pblk->resubmit_lock);
+
+		secs_avail = r_ctx->nr_valid;
+		pos = r_ctx->sentry;
+
+		pblk_prepare_resubmit(pblk, pos, secs_avail);
+		secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail,
+				secs_avail);
 
-	secs_to_com = (secs_to_sync > secs_avail) ? secs_avail : secs_to_sync;
-	pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com);
+		kfree(r_ctx);
+	} else {
+		/* If there are no sectors in the cache,
+		 * flushes (bios without data) will be cleared on
+		 * the cache threads
+		 */
+		secs_avail = pblk_rb_read_count(&pblk->rwb);
+		if (!secs_avail)
+			return 1;
+
+		secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb);
+		if (!secs_to_flush && secs_avail < pblk->min_write_pgs)
+			return 1;
+
+		secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail,
+					secs_to_flush);
+		if (secs_to_sync > pblk->max_write_pgs) {
+			pr_err("pblk: bad buffer sync calculation\n");
+			return 1;
+		}
+
+		secs_to_com = (secs_to_sync > secs_avail) ?
+			secs_avail : secs_to_sync;
+		pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com);
+	}
 
 	bio = bio_alloc(GFP_KERNEL, secs_to_sync);
 

+ 36 - 22
drivers/lightnvm/pblk.h

@@ -89,12 +89,14 @@ struct pblk_sec_meta {
 /* The number of GC lists and the rate-limiter states go together. This way the
  * rate-limiter can dictate how much GC is needed based on resource utilization.
  */
-#define PBLK_GC_NR_LISTS 3
+#define PBLK_GC_NR_LISTS 4
 
 enum {
-	PBLK_RL_HIGH = 1,
-	PBLK_RL_MID = 2,
-	PBLK_RL_LOW = 3,
+	PBLK_RL_OFF = 0,
+	PBLK_RL_WERR = 1,
+	PBLK_RL_HIGH = 2,
+	PBLK_RL_MID = 3,
+	PBLK_RL_LOW = 4
 };
 
 #define pblk_dma_meta_size (sizeof(struct pblk_sec_meta) * PBLK_MAX_REQ_ADDRS)
@@ -128,7 +130,6 @@ struct pblk_pad_rq {
 struct pblk_rec_ctx {
 	struct pblk *pblk;
 	struct nvm_rq *rqd;
-	struct list_head failed;
 	struct work_struct ws_rec;
 };
 
@@ -279,6 +280,8 @@ struct pblk_rl {
 	int rb_user_active;
 	int rb_gc_active;
 
+	atomic_t werr_lines;	/* Number of write error lines that need gc */
+
 	struct timer_list u_timer;
 
 	unsigned long long nr_secs;
@@ -312,6 +315,7 @@ enum {
 	PBLK_LINEGC_MID = 23,
 	PBLK_LINEGC_HIGH = 24,
 	PBLK_LINEGC_FULL = 25,
+	PBLK_LINEGC_WERR = 26
 };
 
 #define PBLK_MAGIC 0x70626c6b /*pblk*/
@@ -413,6 +417,11 @@ struct pblk_smeta {
 	struct line_smeta *buf;		/* smeta buffer in persistent format */
 };
 
+struct pblk_w_err_gc {
+	int has_write_err;
+	__le64 *lba_list;
+};
+
 struct pblk_line {
 	struct pblk *pblk;
 	unsigned int id;		/* Line number corresponds to the
@@ -458,6 +467,8 @@ struct pblk_line {
 
 	struct kref ref;		/* Write buffer L2P references */
 
+	struct pblk_w_err_gc *w_err_gc;	/* Write error gc recovery metadata */
+
 	spinlock_t lock;		/* Necessary for invalid_bitmap only */
 };
 
@@ -489,6 +500,8 @@ struct pblk_line_mgmt {
 	struct list_head gc_mid_list;	/* Full lines ready to GC, mid isc */
 	struct list_head gc_low_list;	/* Full lines ready to GC, low isc */
 
+	struct list_head gc_werr_list;  /* Write err recovery list */
+
 	struct list_head gc_full_list;	/* Full lines ready to GC, no valid */
 	struct list_head gc_empty_list;	/* Full lines close, all valid */
 
@@ -664,12 +677,15 @@ struct pblk {
 
 	struct list_head compl_list;
 
-	mempool_t *page_bio_pool;
-	mempool_t *gen_ws_pool;
-	mempool_t *rec_pool;
-	mempool_t *r_rq_pool;
-	mempool_t *w_rq_pool;
-	mempool_t *e_rq_pool;
+	spinlock_t resubmit_lock;	 /* Resubmit list lock */
+	struct list_head resubmit_list; /* Resubmit list for failed writes */
+
+	mempool_t page_bio_pool;
+	mempool_t gen_ws_pool;
+	mempool_t rec_pool;
+	mempool_t r_rq_pool;
+	mempool_t w_rq_pool;
+	mempool_t e_rq_pool;
 
 	struct workqueue_struct *close_wq;
 	struct workqueue_struct *bb_wq;
@@ -713,9 +729,6 @@ void pblk_rb_sync_l2p(struct pblk_rb *rb);
 unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd,
 				 unsigned int pos, unsigned int nr_entries,
 				 unsigned int count);
-unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio,
-				      struct list_head *list,
-				      unsigned int max);
 int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba,
 			struct ppa_addr ppa, int bio_iter, bool advanced_bio);
 unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int entries);
@@ -766,11 +779,13 @@ struct pblk_line *pblk_line_get_data(struct pblk *pblk);
 struct pblk_line *pblk_line_get_erase(struct pblk *pblk);
 int pblk_line_erase(struct pblk *pblk, struct pblk_line *line);
 int pblk_line_is_full(struct pblk_line *line);
-void pblk_line_free(struct pblk *pblk, struct pblk_line *line);
+void pblk_line_free(struct pblk_line *line);
 void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line);
 void pblk_line_close(struct pblk *pblk, struct pblk_line *line);
 void pblk_line_close_ws(struct work_struct *work);
 void pblk_pipeline_stop(struct pblk *pblk);
+void __pblk_pipeline_stop(struct pblk *pblk);
+void __pblk_pipeline_flush(struct pblk *pblk);
 void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv,
 		     void (*work)(struct work_struct *), gfp_t gfp_mask,
 		     struct workqueue_struct *wq);
@@ -794,7 +809,6 @@ void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
 void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas);
 void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
 		unsigned long *lun_bitmap);
-void pblk_end_io_sync(struct nvm_rq *rqd);
 int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags,
 		       int nr_pages);
 void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off,
@@ -837,23 +851,20 @@ void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
 int pblk_write_ts(void *data);
 void pblk_write_timer_fn(struct timer_list *t);
 void pblk_write_should_kick(struct pblk *pblk);
+void pblk_write_kick(struct pblk *pblk);
 
 /*
  * pblk read path
  */
-extern struct bio_set *pblk_bio_set;
+extern struct bio_set pblk_bio_set;
 int pblk_submit_read(struct pblk *pblk, struct bio *bio);
 int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq);
 /*
  * pblk recovery
  */
-void pblk_submit_rec(struct work_struct *work);
 struct pblk_line *pblk_recov_l2p(struct pblk *pblk);
 int pblk_recov_pad(struct pblk *pblk);
 int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta);
-int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
-			struct pblk_rec_ctx *recovery, u64 *comp_bits,
-			unsigned int comp);
 
 /*
  * pblk gc
@@ -864,7 +875,7 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
 #define PBLK_GC_RSV_LINE 1	/* Reserved lines for GC */
 
 int pblk_gc_init(struct pblk *pblk);
-void pblk_gc_exit(struct pblk *pblk);
+void pblk_gc_exit(struct pblk *pblk, bool graceful);
 void pblk_gc_should_start(struct pblk *pblk);
 void pblk_gc_should_stop(struct pblk *pblk);
 void pblk_gc_should_kick(struct pblk *pblk);
@@ -894,6 +905,9 @@ void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line,
 			    bool used);
 int pblk_rl_is_limit(struct pblk_rl *rl);
 
+void pblk_rl_werr_line_in(struct pblk_rl *rl);
+void pblk_rl_werr_line_out(struct pblk_rl *rl);
+
 /*
  * pblk sysfs
  */
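
The pblk header changes above add write-error recovery state: per-line w_err_gc metadata, a gc_werr_list for lines awaiting recovery GC, and a resubmit_lock/resubmit_list pair on struct pblk for failed writes. A minimal, hypothetical sketch of how such a spinlock-protected resubmit list is typically fed and drained; the entry type and helper names are illustrative, not pblk's actual implementation:

	#include <linux/list.h>
	#include <linux/spinlock.h>
	#include <linux/slab.h>
	#include "pblk.h"		/* struct pblk with resubmit_lock/resubmit_list */

	/* Hypothetical bookkeeping entry; not the real pblk context type. */
	struct failed_w_entry {
		struct list_head list;
		unsigned int sentry;
		unsigned int nr_entries;
	};

	/* Completion path: park a failed write for the writer thread to retry. */
	static void park_failed_write(struct pblk *pblk, struct failed_w_entry *e)
	{
		spin_lock(&pblk->resubmit_lock);
		list_add_tail(&e->list, &pblk->resubmit_list);
		spin_unlock(&pblk->resubmit_lock);
	}

	/* Writer thread: drain the list before taking new work from the write buffer. */
	static void drain_failed_writes(struct pblk *pblk)
	{
		struct failed_w_entry *e, *t;
		LIST_HEAD(local);

		spin_lock(&pblk->resubmit_lock);
		list_splice_init(&pblk->resubmit_list, &local);
		spin_unlock(&pblk->resubmit_lock);

		list_for_each_entry_safe(e, t, &local, list) {
			list_del(&e->list);
			/* re-map the entries to a healthy line and re-issue them */
			kfree(e);
		}
	}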

+ 7 - 7
drivers/md/bcache/bcache.h

@@ -269,7 +269,7 @@ struct bcache_device {
 	atomic_t		*stripe_sectors_dirty;
 	unsigned long		*full_dirty_stripes;
 
-	struct bio_set		*bio_split;
+	struct bio_set		bio_split;
 
 	unsigned		data_csum:1;
 
@@ -345,6 +345,7 @@ struct cached_dev {
 
 	struct keybuf		writeback_keys;
 
+	struct task_struct	*status_update_thread;
 	/*
 	 * Order the write-half of writeback operations strongly in dispatch
 	 * order.  (Maintain LBA order; don't allow reads completing out of
@@ -392,6 +393,7 @@ struct cached_dev {
 #define DEFAULT_CACHED_DEV_ERROR_LIMIT	64
 	atomic_t		io_errors;
 	unsigned		error_limit;
+	unsigned		offline_seconds;
 
 	char			backing_dev_name[BDEVNAME_SIZE];
 };
@@ -528,9 +530,9 @@ struct cache_set {
 	struct closure		sb_write;
 	struct semaphore	sb_write_mutex;
 
-	mempool_t		*search;
-	mempool_t		*bio_meta;
-	struct bio_set		*bio_split;
+	mempool_t		search;
+	mempool_t		bio_meta;
+	struct bio_set		bio_split;
 
 	/* For the btree cache */
 	struct shrinker		shrink;
@@ -655,7 +657,7 @@ struct cache_set {
 	 * A btree node on disk could have too many bsets for an iterator to fit
 	 * on the stack - have to dynamically allocate them
 	 */
-	mempool_t		*fill_iter;
+	mempool_t		fill_iter;
 
 	struct bset_sort_state	sort;
 
@@ -956,8 +958,6 @@ void bch_prio_write(struct cache *);
 void bch_write_bdev_super(struct cached_dev *, struct closure *);
 
 extern struct workqueue_struct *bcache_wq;
-extern const char * const bch_cache_modes[];
-extern const char * const bch_stop_on_failure_modes[];
 extern struct mutex bch_register_lock;
 extern struct list_head bch_cache_sets;
 

+ 4 - 9
drivers/md/bcache/bset.c

@@ -1118,8 +1118,7 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
 
 void bch_bset_sort_state_free(struct bset_sort_state *state)
 {
-	if (state->pool)
-		mempool_destroy(state->pool);
+	mempool_exit(&state->pool);
 }
 
 int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order)
@@ -1129,11 +1128,7 @@ int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order)
 	state->page_order = page_order;
 	state->crit_factor = int_sqrt(1 << page_order);
 
-	state->pool = mempool_create_page_pool(1, page_order);
-	if (!state->pool)
-		return -ENOMEM;
-
-	return 0;
+	return mempool_init_page_pool(&state->pool, 1, page_order);
 }
 EXPORT_SYMBOL(bch_bset_sort_state_init);
 
@@ -1191,7 +1186,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
 
 		BUG_ON(order > state->page_order);
 
-		outp = mempool_alloc(state->pool, GFP_NOIO);
+		outp = mempool_alloc(&state->pool, GFP_NOIO);
 		out = page_address(outp);
 		used_mempool = true;
 		order = state->page_order;
@@ -1220,7 +1215,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
 	}
 
 	if (used_mempool)
-		mempool_free(virt_to_page(out), state->pool);
+		mempool_free(virt_to_page(out), &state->pool);
 	else
 		free_pages((unsigned long) out, order);
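
The bset.c hunks move the sort buffer over to an embedded, page-backed mempool. A small sketch of that pool flavour, assuming a fixed page order and illustrative helper names:

	#include <linux/mempool.h>
	#include <linux/mm.h>
	#include <linux/gfp.h>

	static int sort_pool_init(mempool_t *pool, unsigned int page_order)
	{
		/* one reserved element of 2^page_order contiguous pages */
		return mempool_init_page_pool(pool, 1, page_order);
	}

	static void *sort_buf_get(mempool_t *pool)
	{
		/*
		 * With a sleeping mask such as GFP_NOIO, mempool_alloc() waits
		 * for an element instead of returning NULL, so no error path
		 * is needed here.
		 */
		return page_address(mempool_alloc(pool, GFP_NOIO));
	}

	static void sort_buf_put(mempool_t *pool, void *buf)
	{
		mempool_free(virt_to_page(buf), pool);
	}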
 

+ 1 - 1
drivers/md/bcache/bset.h

@@ -347,7 +347,7 @@ static inline struct bkey *bch_bset_search(struct btree_keys *b,
 /* Sorting */
 
 struct bset_sort_state {
-	mempool_t		*pool;
+	mempool_t		pool;
 
 	unsigned		page_order;
 	unsigned		crit_factor;

+ 2 - 2
drivers/md/bcache/btree.c

@@ -204,7 +204,7 @@ void bch_btree_node_read_done(struct btree *b)
 	struct bset *i = btree_bset_first(b);
 	struct btree_iter *iter;
 
-	iter = mempool_alloc(b->c->fill_iter, GFP_NOIO);
+	iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
 	iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
 	iter->used = 0;
 
@@ -271,7 +271,7 @@ void bch_btree_node_read_done(struct btree *b)
 		bch_bset_init_next(&b->keys, write_block(b),
 				   bset_magic(&b->c->sb));
 out:
-	mempool_free(iter, b->c->fill_iter);
+	mempool_free(iter, &b->c->fill_iter);
 	return;
 err:
 	set_btree_node_io_error(b);

+ 2 - 2
drivers/md/bcache/io.c

@@ -17,12 +17,12 @@
 void bch_bbio_free(struct bio *bio, struct cache_set *c)
 {
 	struct bbio *b = container_of(bio, struct bbio, bio);
-	mempool_free(b, c->bio_meta);
+	mempool_free(b, &c->bio_meta);
 }
 
 struct bio *bch_bbio_alloc(struct cache_set *c)
 {
-	struct bbio *b = mempool_alloc(c->bio_meta, GFP_NOIO);
+	struct bbio *b = mempool_alloc(&c->bio_meta, GFP_NOIO);
 	struct bio *bio = &b->bio;
 
 	bio_init(bio, bio->bi_inline_vecs, bucket_pages(c));

+ 9 - 9
drivers/md/bcache/request.c

@@ -213,7 +213,7 @@ static void bch_data_insert_start(struct closure *cl)
 	do {
 		unsigned i;
 		struct bkey *k;
-		struct bio_set *split = op->c->bio_split;
+		struct bio_set *split = &op->c->bio_split;
 
 		/* 1 for the device pointer and 1 for the chksum */
 		if (bch_keylist_realloc(&op->insert_keys,
@@ -548,7 +548,7 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
 
 	n = bio_next_split(bio, min_t(uint64_t, INT_MAX,
 				      KEY_OFFSET(k) - bio->bi_iter.bi_sector),
-			   GFP_NOIO, s->d->bio_split);
+			   GFP_NOIO, &s->d->bio_split);
 
 	bio_key = &container_of(n, struct bbio, bio)->key;
 	bch_bkey_copy_single_ptr(bio_key, k, ptr);
@@ -707,7 +707,7 @@ static void search_free(struct closure *cl)
 
 	bio_complete(s);
 	closure_debug_destroy(cl);
-	mempool_free(s, s->d->c->search);
+	mempool_free(s, &s->d->c->search);
 }
 
 static inline struct search *search_alloc(struct bio *bio,
@@ -715,7 +715,7 @@ static inline struct search *search_alloc(struct bio *bio,
 {
 	struct search *s;
 
-	s = mempool_alloc(d->c->search, GFP_NOIO);
+	s = mempool_alloc(&d->c->search, GFP_NOIO);
 
 	closure_init(&s->cl, NULL);
 	do_bio_hook(s, bio, request_endio);
@@ -864,7 +864,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
 	s->cache_missed = 1;
 
 	if (s->cache_miss || s->iop.bypass) {
-		miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);
+		miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split);
 		ret = miss == bio ? MAP_DONE : MAP_CONTINUE;
 		goto out_submit;
 	}
@@ -887,14 +887,14 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
 
 	s->iop.replace = true;
 
-	miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);
+	miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split);
 
 	/* btree_search_recurse()'s btree iterator is no good anymore */
 	ret = miss == bio ? MAP_DONE : -EINTR;
 
 	cache_bio = bio_alloc_bioset(GFP_NOWAIT,
 			DIV_ROUND_UP(s->insert_bio_sectors, PAGE_SECTORS),
-			dc->disk.bio_split);
+			&dc->disk.bio_split);
 	if (!cache_bio)
 		goto out_submit;
 
@@ -1008,7 +1008,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
 			struct bio *flush;
 
 			flush = bio_alloc_bioset(GFP_NOIO, 0,
-						 dc->disk.bio_split);
+						 &dc->disk.bio_split);
 			if (!flush) {
 				s->iop.status = BLK_STS_RESOURCE;
 				goto insert_data;
@@ -1021,7 +1021,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
 			closure_bio_submit(s->iop.c, flush, cl);
 		}
 	} else {
-		s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split);
+		s->iop.bio = bio_clone_fast(bio, GFP_NOIO, &dc->disk.bio_split);
 		/* I/O request sent to backing device */
 		bio->bi_end_io = backing_request_endio;
 		closure_bio_submit(s->iop.c, bio, cl);
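
The request.c hunks only retarget the bio_set arguments to the now-embedded &...->bio_split, but the surrounding calls show the split pattern used during cache lookup. A short sketch of bio_next_split() against an embedded bio_set; the helper name is illustrative:

	#include <linux/bio.h>
	#include <linux/gfp.h>

	/*
	 * bio_next_split() returns @bio itself when the remaining bio already
	 * fits in @sectors; otherwise it hands back a new bio for the front
	 * part (allocated from @bs) and advances @bio past it.
	 */
	static bool handle_one_extent(struct bio *bio, int sectors, struct bio_set *bs)
	{
		struct bio *n = bio_next_split(bio, sectors, GFP_NOIO, bs);

		/* ... map and submit 'n' against this extent ... */

		return n == bio;	/* true: nothing left, lookup is done */
	}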

+ 67 - 42
drivers/md/bcache/super.c

@@ -37,24 +37,6 @@ static const char invalid_uuid[] = {
 	0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
 };
 
-/* Default is -1; we skip past it for struct cached_dev's cache mode */
-const char * const bch_cache_modes[] = {
-	"default",
-	"writethrough",
-	"writeback",
-	"writearound",
-	"none",
-	NULL
-};
-
-/* Default is -1; we skip past it for stop_when_cache_set_failed */
-const char * const bch_stop_on_failure_modes[] = {
-	"default",
-	"auto",
-	"always",
-	NULL
-};
-
 static struct kobject *bcache_kobj;
 struct mutex bch_register_lock;
 LIST_HEAD(bch_cache_sets);
@@ -654,6 +636,11 @@ static int ioctl_dev(struct block_device *b, fmode_t mode,
 		     unsigned int cmd, unsigned long arg)
 {
 	struct bcache_device *d = b->bd_disk->private_data;
+	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+
+	if (dc->io_disable)
+		return -EIO;
+
 	return d->ioctl(d, mode, cmd, arg);
 }
 
@@ -766,8 +753,7 @@ static void bcache_device_free(struct bcache_device *d)
 		put_disk(d->disk);
 	}
 
-	if (d->bio_split)
-		bioset_free(d->bio_split);
+	bioset_exit(&d->bio_split);
 	kvfree(d->full_dirty_stripes);
 	kvfree(d->stripe_sectors_dirty);
 
@@ -809,9 +795,8 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
 	if (idx < 0)
 		return idx;
 
-	if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio),
-					   BIOSET_NEED_BVECS |
-					   BIOSET_NEED_RESCUER)) ||
+	if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio),
+			BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) ||
 	    !(d->disk = alloc_disk(BCACHE_MINORS))) {
 		ida_simple_remove(&bcache_device_idx, idx);
 		return -ENOMEM;
@@ -864,6 +849,44 @@ static void calc_cached_dev_sectors(struct cache_set *c)
 	c->cached_dev_sectors = sectors;
 }
 
+#define BACKING_DEV_OFFLINE_TIMEOUT 5
+static int cached_dev_status_update(void *arg)
+{
+	struct cached_dev *dc = arg;
+	struct request_queue *q;
+
+	/*
+	 * If this kthread is being stopped from outside, quit right away.
+	 * dc->io_disable might also be set via the sysfs interface, so
+	 * check it here too.
+	 */
+	while (!kthread_should_stop() && !dc->io_disable) {
+		q = bdev_get_queue(dc->bdev);
+		if (blk_queue_dying(q))
+			dc->offline_seconds++;
+		else
+			dc->offline_seconds = 0;
+
+		if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) {
+			pr_err("%s: device offline for %d seconds",
+			       dc->backing_dev_name,
+			       BACKING_DEV_OFFLINE_TIMEOUT);
+			pr_err("%s: disable I/O request due to backing "
+			       "device offline", dc->disk.name);
+			dc->io_disable = true;
+			/* let others know earlier that io_disable is true */
+			smp_mb();
+			bcache_device_stop(&dc->disk);
+			break;
+		}
+		schedule_timeout_interruptible(HZ);
+	}
+
+	wait_for_kthread_stop();
+	return 0;
+}
+
+
 void bch_cached_dev_run(struct cached_dev *dc)
 {
 	struct bcache_device *d = &dc->disk;
@@ -906,6 +929,14 @@ void bch_cached_dev_run(struct cached_dev *dc)
 	if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
 	    sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
 		pr_debug("error creating sysfs link");
+
+	dc->status_update_thread = kthread_run(cached_dev_status_update,
+					       dc, "bcache_status_update");
+	if (IS_ERR(dc->status_update_thread)) {
+		pr_warn("failed to create bcache_status_update kthread, "
+			"continue to run without monitoring backing "
+			"device status");
+	}
 }
 
 /*
@@ -1139,6 +1170,8 @@ static void cached_dev_free(struct closure *cl)
 		kthread_stop(dc->writeback_thread);
 	if (dc->writeback_write_wq)
 		destroy_workqueue(dc->writeback_write_wq);
+	if (!IS_ERR_OR_NULL(dc->status_update_thread))
+		kthread_stop(dc->status_update_thread);
 
 	if (atomic_read(&dc->running))
 		bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
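
The status-update monitor added above is a plain kthread: started from bch_cached_dev_run() and stopped in cached_dev_free() only when creation succeeded. A stripped-down sketch of that lifecycle with illustrative names:

	#include <linux/kthread.h>
	#include <linux/sched.h>
	#include <linux/err.h>

	static int monitor_fn(void *arg)
	{
		while (!kthread_should_stop()) {
			/* poll the backing device roughly once per second */
			schedule_timeout_interruptible(HZ);
		}
		return 0;
	}

	static struct task_struct *start_monitor(void *arg)
	{
		/* returns ERR_PTR() on failure; the caller may run without the monitor */
		return kthread_run(monitor_fn, arg, "my_status_update");
	}

	static void stop_monitor(struct task_struct *t)
	{
		/* guard against the failed-creation case, as cached_dev_free() does */
		if (!IS_ERR_OR_NULL(t))
			kthread_stop(t);
	}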
@@ -1465,14 +1498,10 @@ static void cache_set_free(struct closure *cl)
 
 	if (c->moving_gc_wq)
 		destroy_workqueue(c->moving_gc_wq);
-	if (c->bio_split)
-		bioset_free(c->bio_split);
-	if (c->fill_iter)
-		mempool_destroy(c->fill_iter);
-	if (c->bio_meta)
-		mempool_destroy(c->bio_meta);
-	if (c->search)
-		mempool_destroy(c->search);
+	bioset_exit(&c->bio_split);
+	mempool_exit(&c->fill_iter);
+	mempool_exit(&c->bio_meta);
+	mempool_exit(&c->search);
 	kfree(c->devices);
 
 	mutex_lock(&bch_register_lock);
@@ -1683,21 +1712,17 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 	INIT_LIST_HEAD(&c->btree_cache_freed);
 	INIT_LIST_HEAD(&c->data_buckets);
 
-	c->search = mempool_create_slab_pool(32, bch_search_cache);
-	if (!c->search)
-		goto err;
-
 	iter_size = (sb->bucket_size / sb->block_size + 1) *
 		sizeof(struct btree_iter_set);
 
 	if (!(c->devices = kzalloc(c->nr_uuids * sizeof(void *), GFP_KERNEL)) ||
-	    !(c->bio_meta = mempool_create_kmalloc_pool(2,
-				sizeof(struct bbio) + sizeof(struct bio_vec) *
-				bucket_pages(c))) ||
-	    !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
-	    !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio),
-					   BIOSET_NEED_BVECS |
-					   BIOSET_NEED_RESCUER)) ||
+	    mempool_init_slab_pool(&c->search, 32, bch_search_cache) ||
+	    mempool_init_kmalloc_pool(&c->bio_meta, 2,
+				      sizeof(struct bbio) + sizeof(struct bio_vec) *
+				      bucket_pages(c)) ||
+	    mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
+	    bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio),
+			BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) ||
 	    !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
 	    !(c->moving_gc_wq = alloc_workqueue("bcache_gc",
 						WQ_MEM_RECLAIM, 0)) ||

Some files were not shown because too many files changed in this diff