Browse Source

Merge tag 'for-4.17/block-20180402' of git://git.kernel.dk/linux-block

Pull block layer updates from Jens Axboe:
 "It's a pretty quiet round this time, which is nice. This contains:

   - series from Bart, cleaning up the way we set/test/clear atomic
     queue flags.

   - series from Bart, fixing races between gendisk and queue
     registration and removal.

   - set of bcache fixes and improvements from various folks, by way of
     Michael Lyle.

   - set of lightnvm updates from Matias, most of it being the 1.2 to
     2.0 transition.

   - removal of unused DIO flags from Nikolay.

   - blk-mq/sbitmap memory ordering fixes from Omar.

   - divide-by-zero fix for BFQ from Paolo.

   - minor documentation patches from Randy.

   - timeout fix from Tejun.

   - Alpha "can't write a char atomically" fix from Mikulas.

   - set of NVMe fixes by way of Keith.

   - bsg and bsg-lib improvements from Christoph.

   - a few sed-opal fixes from Jonas.

   - cdrom check-disk-change deadlock fix from Maurizio.

   - various little fixes, comment fixes, etc from various folks"

* tag 'for-4.17/block-20180402' of git://git.kernel.dk/linux-block: (139 commits)
  blk-mq: Directly schedule q->timeout_work when aborting a request
  blktrace: fix comment in blktrace_api.h
  lightnvm: remove function name in strings
  lightnvm: pblk: remove some unnecessary NULL checks
  lightnvm: pblk: don't recover unwritten lines
  lightnvm: pblk: implement 2.0 support
  lightnvm: pblk: implement get log report chunk
  lightnvm: pblk: rename ppaf* to addrf*
  lightnvm: pblk: check for supported version
  lightnvm: implement get log report chunk helpers
  lightnvm: make address conversions depend on generic device
  lightnvm: add support for 2.0 address format
  lightnvm: normalize geometry nomenclature
  lightnvm: complete geo structure with maxoc*
  lightnvm: add shorten OCSSD version in geo
  lightnvm: add minor version to generic geometry
  lightnvm: simplify geometry structure
  lightnvm: pblk: refactor init/exit sequences
  lightnvm: Avoid validation of default op value
  lightnvm: centralize permission check for lightnvm ioctl
  ...
Linus Torvalds 7 years ago
parent
commit
3526dd0c78
100 changed files with 3763 additions and 1694 deletions
  1. 21 10
      Documentation/cdrom/cdrom-standard.tex
  2. 8 0
      Documentation/fault-injection/fault-injection.txt
  3. 116 0
      Documentation/fault-injection/nvme-fault-injection.txt
  4. 1 0
      MAINTAINERS
  5. 0 1
      arch/xtensa/platforms/iss/simdisk.c
  6. 24 1
      block/bfq-iosched.c
  7. 1 1
      block/bfq-iosched.h
  8. 2 2
      block/bio.c
  9. 62 16
      block/blk-cgroup.c
  10. 168 82
      block/blk-core.c
  11. 90 60
      block/blk-mq-debugfs.c
  12. 4 2
      block/blk-mq-pci.c
  13. 6 14
      block/blk-mq.c
  14. 2 4
      block/blk-settings.c
  15. 3 3
      block/blk-stat.c
  16. 7 22
      block/blk-sysfs.c
  17. 3 5
      block/blk-timeout.c
  18. 2 2
      block/blk-zoned.c
  19. 69 0
      block/blk.h
  20. 108 57
      block/bsg-lib.c
  21. 117 145
      block/bsg.c
  22. 27 10
      block/sed-opal.c
  23. 0 1
      drivers/block/brd.c
  24. 1 2
      drivers/block/drbd/drbd_main.c
  25. 2 2
      drivers/block/drbd/drbd_nl.c
  26. 48 29
      drivers/block/loop.c
  27. 4 4
      drivers/block/mtip32xx/mtip32xx.c
  28. 4 4
      drivers/block/nbd.c
  29. 85 39
      drivers/block/null_blk.c
  30. 2 0
      drivers/block/paride/pcd.c
  31. 2 11
      drivers/block/rbd.c
  32. 3 3
      drivers/block/rsxx/dev.c
  33. 2 2
      drivers/block/skd_main.c
  34. 3 4
      drivers/block/umem.c
  35. 5 5
      drivers/block/xen-blkfront.c
  36. 4 4
      drivers/block/zram/zram_drv.c
  37. 0 1
      drivers/block/zram/zram_drv.h
  38. 0 3
      drivers/cdrom/cdrom.c
  39. 3 0
      drivers/cdrom/gdrom.c
  40. 6 4
      drivers/ide/ide-cd.c
  41. 1 5
      drivers/ide/ide-cd.h
  42. 2 2
      drivers/ide/ide-disk.c
  43. 2 2
      drivers/ide/ide-probe.c
  44. 91 149
      drivers/lightnvm/core.c
  45. 4 0
      drivers/lightnvm/pblk-cache.c
  46. 154 48
      drivers/lightnvm/pblk-core.c
  47. 4 8
      drivers/lightnvm/pblk-gc.c
  48. 499 321
      drivers/lightnvm/pblk-init.c
  49. 4 2
      drivers/lightnvm/pblk-map.c
  50. 13 8
      drivers/lightnvm/pblk-rb.c
  51. 1 1
      drivers/lightnvm/pblk-read.c
  52. 76 15
      drivers/lightnvm/pblk-recovery.c
  53. 1 1
      drivers/lightnvm/pblk-rl.c
  54. 212 23
      drivers/lightnvm/pblk-sysfs.c
  55. 1 1
      drivers/lightnvm/pblk-write.c
  56. 220 84
      drivers/lightnvm/pblk.h
  57. 2 1
      drivers/md/bcache/alloc.c
  58. 53 4
      drivers/md/bcache/bcache.h
  59. 2 2
      drivers/md/bcache/bset.c
  60. 3 2
      drivers/md/bcache/bset.h
  61. 17 9
      drivers/md/bcache/btree.c
  62. 9 8
      drivers/md/bcache/closure.c
  63. 3 2
      drivers/md/bcache/closure.h
  64. 7 7
      drivers/md/bcache/debug.c
  65. 0 2
      drivers/md/bcache/extents.c
  66. 15 1
      drivers/md/bcache/io.c
  67. 5 3
      drivers/md/bcache/journal.c
  68. 157 29
      drivers/md/bcache/request.c
  69. 136 24
      drivers/md/bcache/super.c
  70. 52 3
      drivers/md/bcache/sysfs.c
  71. 15 10
      drivers/md/bcache/util.c
  72. 0 6
      drivers/md/bcache/util.h
  73. 79 13
      drivers/md/bcache/writeback.c
  74. 1 3
      drivers/md/bcache/writeback.h
  75. 8 8
      drivers/md/dm-table.c
  76. 1 1
      drivers/md/dm.c
  77. 2 2
      drivers/md/md-linear.c
  78. 5 5
      drivers/md/md.c
  79. 2 2
      drivers/md/raid0.c
  80. 3 3
      drivers/md/raid1.c
  81. 3 3
      drivers/md/raid10.c
  82. 2 2
      drivers/md/raid5.c
  83. 2 2
      drivers/misc/cardreader/rtsx_pcr.c
  84. 1 1
      drivers/mmc/core/block.c
  85. 4 4
      drivers/mmc/core/queue.c
  86. 3 3
      drivers/mtd/mtd_blkdevs.c
  87. 1 1
      drivers/nvdimm/blk.c
  88. 1 1
      drivers/nvdimm/btt.c
  89. 0 1
      drivers/nvdimm/nd.h
  90. 3 3
      drivers/nvdimm/pmem.c
  91. 1 0
      drivers/nvme/host/Makefile
  92. 79 44
      drivers/nvme/host/core.c
  93. 79 0
      drivers/nvme/host/fault_inject.c
  94. 16 20
      drivers/nvme/host/fc.c
  95. 579 178
      drivers/nvme/host/lightnvm.c
  96. 4 4
      drivers/nvme/host/multipath.c
  97. 34 1
      drivers/nvme/host/nvme.h
  98. 18 8
      drivers/nvme/host/pci.c
  99. 25 9
      drivers/nvme/host/rdma.c
  100. 31 34
      drivers/nvme/target/configfs.c

+ 21 - 10
Documentation/cdrom/cdrom-standard.tex

@@ -234,6 +234,7 @@ struct& cdrom_device_ops\ \{ \hidewidth\cr
   &int& (* open)(struct\ cdrom_device_info *, int)\cr
   &int& (* open)(struct\ cdrom_device_info *, int)\cr
   &void& (* release)(struct\ cdrom_device_info *);\cr 
   &void& (* release)(struct\ cdrom_device_info *);\cr 
   &int& (* drive_status)(struct\ cdrom_device_info *, int);\cr     
   &int& (* drive_status)(struct\ cdrom_device_info *, int);\cr     
+  &unsigned\ int& (* check_events)(struct\ cdrom_device_info *, unsigned\ int, int);\cr
   &int& (* media_changed)(struct\ cdrom_device_info *, int);\cr 
   &int& (* media_changed)(struct\ cdrom_device_info *, int);\cr 
   &int& (* tray_move)(struct\ cdrom_device_info *, int);\cr
   &int& (* tray_move)(struct\ cdrom_device_info *, int);\cr
   &int& (* lock_door)(struct\ cdrom_device_info *, int);\cr
   &int& (* lock_door)(struct\ cdrom_device_info *, int);\cr
@@ -245,10 +246,9 @@ struct& cdrom_device_ops\ \{ \hidewidth\cr
   &int& (* reset)(struct\ cdrom_device_info *);\cr
   &int& (* reset)(struct\ cdrom_device_info *);\cr
   &int& (* audio_ioctl)(struct\ cdrom_device_info *, unsigned\ int, 
   &int& (* audio_ioctl)(struct\ cdrom_device_info *, unsigned\ int, 
         void *{});\cr 
         void *{});\cr 
-  &int& (* dev_ioctl)(struct\ cdrom_device_info *, unsigned\ int, 
-        unsigned\ long);\cr
 \noalign{\medskip}
 \noalign{\medskip}
   &const\ int& capability;& capability flags \cr
   &const\ int& capability;& capability flags \cr
+  &int& (* generic_packet)(struct\ cdrom_device_info *, struct\ packet_command *{});\cr
 \};\cr
 \};\cr
 }
 }
 $$
 $$
@@ -274,19 +274,32 @@ $$
 \halign{$#$\ \hfil&$#$\ \hfil&\hbox to 10em{$#$\hss}&
 \halign{$#$\ \hfil&$#$\ \hfil&\hbox to 10em{$#$\hss}&
   $/*$ \rm# $*/$\hfil\cr
   $/*$ \rm# $*/$\hfil\cr
 struct& cdrom_device_info\ \{ \hidewidth\cr
 struct& cdrom_device_info\ \{ \hidewidth\cr
-  & struct\ cdrom_device_ops *& ops;& device operations for this major\cr
-  & struct\ cdrom_device_info *& next;& next device_info for this major\cr
+  & const\ struct\ cdrom_device_ops *& ops;& device operations for this major\cr
+  & struct\ list_head& list;& linked list of all device_info\cr
+  & struct\ gendisk *& disk;& matching block layer disk\cr
   & void *&  handle;& driver-dependent data\cr
   & void *&  handle;& driver-dependent data\cr
 \noalign{\medskip}
 \noalign{\medskip}
-  & kdev_t&  dev;& device number (incorporates minor)\cr
   & int& mask;& mask of capability: disables them \cr
   & int& mask;& mask of capability: disables them \cr
   & int& speed;& maximum speed for reading data \cr
   & int& speed;& maximum speed for reading data \cr
   & int& capacity;& number of discs in a jukebox \cr
   & int& capacity;& number of discs in a jukebox \cr
 \noalign{\medskip}
 \noalign{\medskip}
-  &int& options : 30;& options flags \cr
+  &unsigned\ int& options : 30;& options flags \cr
   &unsigned& mc_flags : 2;& media-change buffer flags \cr
   &unsigned& mc_flags : 2;& media-change buffer flags \cr
+  &unsigned\ int& vfs_events;& cached events for vfs path\cr
+  &unsigned\ int& ioctl_events;& cached events for ioctl path\cr
   & int& use_count;& number of times device is opened\cr
   & int& use_count;& number of times device is opened\cr
   & char& name[20];& name of the device type\cr
   & char& name[20];& name of the device type\cr
+\noalign{\medskip}
+  &__u8& sanyo_slot : 2;& Sanyo 3-CD changer support\cr
+  &__u8& keeplocked : 1;& CDROM_LOCKDOOR status\cr
+  &__u8& reserved : 5;& not used yet\cr
+  & int& cdda_method;& see CDDA_* flags\cr
+  &__u8& last_sense;& saves last sense key\cr
+  &__u8& media_written;& dirty flag, DVD+RW bookkeeping\cr
+  &unsigned\ short& mmc3_profile;& current MMC3 profile\cr
+  & int& for_data;& unknown:TBD\cr
+  & int\ (* exit)\ (struct\ cdrom_device_info *);&& unknown:TBD\cr
+  & int& mrw_mode_page;& which MRW mode page is in use\cr
 \}\cr
 \}\cr
 }$$
 }$$
 Using this $struct$, a linked list of the registered minor devices is
 Using this $struct$, a linked list of the registered minor devices is
@@ -298,9 +311,7 @@ The $mask$ flags can be used to mask out some of the capabilities listed
 in $ops\to capability$, if a specific drive doesn't support a feature
 in $ops\to capability$, if a specific drive doesn't support a feature
 of the driver. The value $speed$ specifies the maximum head-rate of the
 of the driver. The value $speed$ specifies the maximum head-rate of the
 drive, measured in units of normal audio speed (176\,kB/sec raw data or
 drive, measured in units of normal audio speed (176\,kB/sec raw data or
-150\,kB/sec file system data). The value $n_discs$ should reflect the
-number of discs the drive can hold simultaneously, if it is designed
-as a juke-box, or otherwise~1. The parameters are declared $const$
+150\,kB/sec file system data).  The parameters are declared $const$
 because they describe properties of the drive, which don't change after
 because they describe properties of the drive, which don't change after
 registration.
 registration.
 
 
@@ -1002,7 +1013,7 @@ taken over the torch in maintaining \cdromc\ and integrating much
 \cdrom-related code in the 2.1-kernel.  Thanks to Scott Snyder and
 \cdrom-related code in the 2.1-kernel.  Thanks to Scott Snyder and
 Gerd Knorr, who were the first to implement this interface for SCSI
 Gerd Knorr, who were the first to implement this interface for SCSI
 and IDE-CD drivers and added many ideas for extension of the data
 and IDE-CD drivers and added many ideas for extension of the data
-structures relative to kernel~2.0.  Further thanks to Heiko Ei{\sz}feldt,
+structures relative to kernel~2.0.  Further thanks to Heiko Ei{\ss}feldt,
 Thomas Quinot, Jon Tombs, Ken Pizzini, Eberhard M\"onkeberg and Andrew
 Thomas Quinot, Jon Tombs, Ken Pizzini, Eberhard M\"onkeberg and Andrew
 Kroll, the \linux\ \cdrom\ device driver developers who were kind
 Kroll, the \linux\ \cdrom\ device driver developers who were kind
 enough to give suggestions and criticisms during the writing. Finally
 enough to give suggestions and criticisms during the writing. Finally

+ 8 - 0
Documentation/fault-injection/fault-injection.txt

@@ -36,6 +36,14 @@ o fail_function
   ALLOW_ERROR_INJECTION() macro, by setting debugfs entries
   ALLOW_ERROR_INJECTION() macro, by setting debugfs entries
   under /sys/kernel/debug/fail_function. No boot option supported.
   under /sys/kernel/debug/fail_function. No boot option supported.
 
 
+o NVMe fault injection
+
+  inject NVMe status code and retry flag on devices permitted by setting
+  debugfs entries under /sys/kernel/debug/nvme*/fault_inject. The default
+  status code is NVME_SC_INVALID_OPCODE with no retry. The status code and
+  retry flag can be set via the debugfs.
+
+
 Configure fault-injection capabilities behavior
 Configure fault-injection capabilities behavior
 -----------------------------------------------
 -----------------------------------------------
 
 

+ 116 - 0
Documentation/fault-injection/nvme-fault-injection.txt

@@ -0,0 +1,116 @@
+NVMe Fault Injection
+====================
+Linux's fault injection framework provides a systematic way to support
+error injection via debugfs in the /sys/kernel/debug directory. When
+enabled, the default NVME_SC_INVALID_OPCODE with no retry will be
+injected into the nvme_end_request. Users can change the default status
+code and no retry flag via the debugfs. The list of Generic Command
+Status can be found in include/linux/nvme.h
+
+Following examples show how to inject an error into the nvme.
+
+First, enable CONFIG_FAULT_INJECTION_DEBUG_FS kernel config,
+recompile the kernel. After booting up the kernel, do the
+following.
+
+Example 1: Inject default status code with no retry
+---------------------------------------------------
+
+mount /dev/nvme0n1 /mnt
+echo 1 > /sys/kernel/debug/nvme0n1/fault_inject/times
+echo 100 > /sys/kernel/debug/nvme0n1/fault_inject/probability
+cp a.file /mnt
+
+Expected Result:
+
+cp: cannot stat ‘/mnt/a.file’: Input/output error
+
+Message from dmesg:
+
+FAULT_INJECTION: forcing a failure.
+name fault_inject, interval 1, probability 100, space 0, times 1
+CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.15.0-rc8+ #2
+Hardware name: innotek GmbH VirtualBox/VirtualBox,
+BIOS VirtualBox 12/01/2006
+Call Trace:
+  <IRQ>
+  dump_stack+0x5c/0x7d
+  should_fail+0x148/0x170
+  nvme_should_fail+0x2f/0x50 [nvme_core]
+  nvme_process_cq+0xe7/0x1d0 [nvme]
+  nvme_irq+0x1e/0x40 [nvme]
+  __handle_irq_event_percpu+0x3a/0x190
+  handle_irq_event_percpu+0x30/0x70
+  handle_irq_event+0x36/0x60
+  handle_fasteoi_irq+0x78/0x120
+  handle_irq+0xa7/0x130
+  ? tick_irq_enter+0xa8/0xc0
+  do_IRQ+0x43/0xc0
+  common_interrupt+0xa2/0xa2
+  </IRQ>
+RIP: 0010:native_safe_halt+0x2/0x10
+RSP: 0018:ffffffff82003e90 EFLAGS: 00000246 ORIG_RAX: ffffffffffffffdd
+RAX: ffffffff817a10c0 RBX: ffffffff82012480 RCX: 0000000000000000
+RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
+RBP: 0000000000000000 R08: 000000008e38ce64 R09: 0000000000000000
+R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff82012480
+R13: ffffffff82012480 R14: 0000000000000000 R15: 0000000000000000
+  ? __sched_text_end+0x4/0x4
+  default_idle+0x18/0xf0
+  do_idle+0x150/0x1d0
+  cpu_startup_entry+0x6f/0x80
+  start_kernel+0x4c4/0x4e4
+  ? set_init_arg+0x55/0x55
+  secondary_startup_64+0xa5/0xb0
+  print_req_error: I/O error, dev nvme0n1, sector 9240
+EXT4-fs error (device nvme0n1): ext4_find_entry:1436:
+inode #2: comm cp: reading directory lblock 0
+
+Example 2: Inject default status code with retry
+------------------------------------------------
+
+mount /dev/nvme0n1 /mnt
+echo 1 > /sys/kernel/debug/nvme0n1/fault_inject/times
+echo 100 > /sys/kernel/debug/nvme0n1/fault_inject/probability
+echo 1 > /sys/kernel/debug/nvme0n1/fault_inject/status
+echo 0 > /sys/kernel/debug/nvme0n1/fault_inject/dont_retry
+
+cp a.file /mnt
+
+Expected Result:
+
+command success without error
+
+Message from dmesg:
+
+FAULT_INJECTION: forcing a failure.
+name fault_inject, interval 1, probability 100, space 0, times 1
+CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.15.0-rc8+ #4
+Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
+Call Trace:
+  <IRQ>
+  dump_stack+0x5c/0x7d
+  should_fail+0x148/0x170
+  nvme_should_fail+0x30/0x60 [nvme_core]
+  nvme_loop_queue_response+0x84/0x110 [nvme_loop]
+  nvmet_req_complete+0x11/0x40 [nvmet]
+  nvmet_bio_done+0x28/0x40 [nvmet]
+  blk_update_request+0xb0/0x310
+  blk_mq_end_request+0x18/0x60
+  flush_smp_call_function_queue+0x3d/0xf0
+  smp_call_function_single_interrupt+0x2c/0xc0
+  call_function_single_interrupt+0xa2/0xb0
+  </IRQ>
+RIP: 0010:native_safe_halt+0x2/0x10
+RSP: 0018:ffffc9000068bec0 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff04
+RAX: ffffffff817a10c0 RBX: ffff88011a3c9680 RCX: 0000000000000000
+RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
+RBP: 0000000000000001 R08: 000000008e38c131 R09: 0000000000000000
+R10: 0000000000000000 R11: 0000000000000000 R12: ffff88011a3c9680
+R13: ffff88011a3c9680 R14: 0000000000000000 R15: 0000000000000000
+  ? __sched_text_end+0x4/0x4
+  default_idle+0x18/0xf0
+  do_idle+0x150/0x1d0
+  cpu_startup_entry+0x6f/0x80
+  start_secondary+0x187/0x1e0
+  secondary_startup_64+0xa5/0xb0

+ 1 - 0
MAINTAINERS

@@ -2646,6 +2646,7 @@ L:	linux-block@vger.kernel.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git
 S:	Maintained
 S:	Maintained
 F:	block/
 F:	block/
+F:	drivers/block/
 F:	kernel/trace/blktrace.c
 F:	kernel/trace/blktrace.c
 F:	lib/sbitmap.c
 F:	lib/sbitmap.c
 
 

+ 0 - 1
arch/xtensa/platforms/iss/simdisk.c

@@ -21,7 +21,6 @@
 #include <platform/simcall.h>
 #include <platform/simcall.h>
 
 
 #define SIMDISK_MAJOR 240
 #define SIMDISK_MAJOR 240
-#define SECTOR_SHIFT 9
 #define SIMDISK_MINORS 1
 #define SIMDISK_MINORS 1
 #define MAX_SIMDISK_COUNT 10
 #define MAX_SIMDISK_COUNT 10
 
 

+ 24 - 1
block/bfq-iosched.c

@@ -201,7 +201,20 @@ static struct kmem_cache *bfq_pool;
 /* Target observation time interval for a peak-rate update (ns) */
 /* Target observation time interval for a peak-rate update (ns) */
 #define BFQ_RATE_REF_INTERVAL	NSEC_PER_SEC
 #define BFQ_RATE_REF_INTERVAL	NSEC_PER_SEC
 
 
-/* Shift used for peak rate fixed precision calculations. */
+/*
+ * Shift used for peak-rate fixed precision calculations.
+ * With
+ * - the current shift: 16 positions
+ * - the current type used to store rate: u32
+ * - the current unit of measure for rate: [sectors/usec], or, more precisely,
+ *   [(sectors/usec) / 2^BFQ_RATE_SHIFT] to take into account the shift,
+ * the range of rates that can be stored is
+ * [1 / 2^BFQ_RATE_SHIFT, 2^(32 - BFQ_RATE_SHIFT)] sectors/usec =
+ * [1 / 2^16, 2^16] sectors/usec = [15e-6, 65536] sectors/usec =
+ * [15, 65G] sectors/sec
+ * Which, assuming a sector size of 512B, corresponds to a range of
+ * [7.5K, 33T] B/sec
+ */
 #define BFQ_RATE_SHIFT		16
 #define BFQ_RATE_SHIFT		16
 
 
 /*
 /*
@@ -2637,6 +2650,16 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
 	rate /= divisor; /* smoothing constant alpha = 1/divisor */
 	rate /= divisor; /* smoothing constant alpha = 1/divisor */
 
 
 	bfqd->peak_rate += rate;
 	bfqd->peak_rate += rate;
+
+	/*
+	 * For a very slow device, bfqd->peak_rate can reach 0 (see
+	 * the minimum representable values reported in the comments
+	 * on BFQ_RATE_SHIFT). Push to 1 if this happens, to avoid
+	 * divisions by zero where bfqd->peak_rate is used as a
+	 * divisor.
+	 */
+	bfqd->peak_rate = max_t(u32, 1, bfqd->peak_rate);
+
 	update_thr_responsiveness_params(bfqd);
 	update_thr_responsiveness_params(bfqd);
 
 
 reset_computation:
 reset_computation:

+ 1 - 1
block/bfq-iosched.h

@@ -499,7 +499,7 @@ struct bfq_data {
 	u64 delta_from_first;
 	u64 delta_from_first;
 	/*
 	/*
 	 * Current estimate of the device peak rate, measured in
 	 * Current estimate of the device peak rate, measured in
-	 * [BFQ_RATE_SHIFT * sectors/usec]. The left-shift by
+	 * [(sectors/usec) / 2^BFQ_RATE_SHIFT]. The left-shift by
 	 * BFQ_RATE_SHIFT is performed to increase precision in
 	 * BFQ_RATE_SHIFT is performed to increase precision in
 	 * fixed-point calculations.
 	 * fixed-point calculations.
 	 */
 	 */

+ 2 - 2
block/bio.c

@@ -43,9 +43,9 @@
  * break badly! cannot be bigger than what you can fit into an
  * break badly! cannot be bigger than what you can fit into an
  * unsigned short
  * unsigned short
  */
  */
-#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
+#define BV(x, n) { .nr_vecs = x, .name = "biovec-"#n }
 static struct biovec_slab bvec_slabs[BVEC_POOL_NR] __read_mostly = {
 static struct biovec_slab bvec_slabs[BVEC_POOL_NR] __read_mostly = {
-	BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
+	BV(1, 1), BV(4, 4), BV(16, 16), BV(64, 64), BV(128, 128), BV(BIO_MAX_PAGES, max),
 };
 };
 #undef BV
 #undef BV
 
 

+ 62 - 16
block/blk-cgroup.c

@@ -307,11 +307,28 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
 	}
 	}
 }
 }
 
 
+static void blkg_pd_offline(struct blkcg_gq *blkg)
+{
+	int i;
+
+	lockdep_assert_held(blkg->q->queue_lock);
+	lockdep_assert_held(&blkg->blkcg->lock);
+
+	for (i = 0; i < BLKCG_MAX_POLS; i++) {
+		struct blkcg_policy *pol = blkcg_policy[i];
+
+		if (blkg->pd[i] && !blkg->pd[i]->offline &&
+		    pol->pd_offline_fn) {
+			pol->pd_offline_fn(blkg->pd[i]);
+			blkg->pd[i]->offline = true;
+		}
+	}
+}
+
 static void blkg_destroy(struct blkcg_gq *blkg)
 static void blkg_destroy(struct blkcg_gq *blkg)
 {
 {
 	struct blkcg *blkcg = blkg->blkcg;
 	struct blkcg *blkcg = blkg->blkcg;
 	struct blkcg_gq *parent = blkg->parent;
 	struct blkcg_gq *parent = blkg->parent;
-	int i;
 
 
 	lockdep_assert_held(blkg->q->queue_lock);
 	lockdep_assert_held(blkg->q->queue_lock);
 	lockdep_assert_held(&blkcg->lock);
 	lockdep_assert_held(&blkcg->lock);
@@ -320,13 +337,6 @@ static void blkg_destroy(struct blkcg_gq *blkg)
 	WARN_ON_ONCE(list_empty(&blkg->q_node));
 	WARN_ON_ONCE(list_empty(&blkg->q_node));
 	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
 	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
 
 
-	for (i = 0; i < BLKCG_MAX_POLS; i++) {
-		struct blkcg_policy *pol = blkcg_policy[i];
-
-		if (blkg->pd[i] && pol->pd_offline_fn)
-			pol->pd_offline_fn(blkg->pd[i]);
-	}
-
 	if (parent) {
 	if (parent) {
 		blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes);
 		blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes);
 		blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios);
 		blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios);
@@ -369,6 +379,7 @@ static void blkg_destroy_all(struct request_queue *q)
 		struct blkcg *blkcg = blkg->blkcg;
 		struct blkcg *blkcg = blkg->blkcg;
 
 
 		spin_lock(&blkcg->lock);
 		spin_lock(&blkcg->lock);
+		blkg_pd_offline(blkg);
 		blkg_destroy(blkg);
 		blkg_destroy(blkg);
 		spin_unlock(&blkcg->lock);
 		spin_unlock(&blkcg->lock);
 	}
 	}
@@ -995,25 +1006,25 @@ static struct cftype blkcg_legacy_files[] = {
  * @css: css of interest
  * @css: css of interest
  *
  *
  * This function is called when @css is about to go away and responsible
  * This function is called when @css is about to go away and responsible
- * for shooting down all blkgs associated with @css.  blkgs should be
- * removed while holding both q and blkcg locks.  As blkcg lock is nested
- * inside q lock, this function performs reverse double lock dancing.
+ * for offlining all blkgs pd and killing all wbs associated with @css.
+ * blkgs pd offline should be done while holding both q and blkcg locks.
+ * As blkcg lock is nested inside q lock, this function performs reverse
+ * double lock dancing.
  *
  *
  * This is the blkcg counterpart of ioc_release_fn().
  * This is the blkcg counterpart of ioc_release_fn().
  */
  */
 static void blkcg_css_offline(struct cgroup_subsys_state *css)
 static void blkcg_css_offline(struct cgroup_subsys_state *css)
 {
 {
 	struct blkcg *blkcg = css_to_blkcg(css);
 	struct blkcg *blkcg = css_to_blkcg(css);
+	struct blkcg_gq *blkg;
 
 
 	spin_lock_irq(&blkcg->lock);
 	spin_lock_irq(&blkcg->lock);
 
 
-	while (!hlist_empty(&blkcg->blkg_list)) {
-		struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
-						struct blkcg_gq, blkcg_node);
+	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
 		struct request_queue *q = blkg->q;
 		struct request_queue *q = blkg->q;
 
 
 		if (spin_trylock(q->queue_lock)) {
 		if (spin_trylock(q->queue_lock)) {
-			blkg_destroy(blkg);
+			blkg_pd_offline(blkg);
 			spin_unlock(q->queue_lock);
 			spin_unlock(q->queue_lock);
 		} else {
 		} else {
 			spin_unlock_irq(&blkcg->lock);
 			spin_unlock_irq(&blkcg->lock);
@@ -1027,11 +1038,43 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css)
 	wb_blkcg_offline(blkcg);
 	wb_blkcg_offline(blkcg);
 }
 }
 
 
+/**
+ * blkcg_destroy_all_blkgs - destroy all blkgs associated with a blkcg
+ * @blkcg: blkcg of interest
+ *
+ * This function is called when blkcg css is about to free and responsible for
+ * destroying all blkgs associated with @blkcg.
+ * blkgs should be removed while holding both q and blkcg locks. As blkcg lock
+ * is nested inside q lock, this function performs reverse double lock dancing.
+ */
+static void blkcg_destroy_all_blkgs(struct blkcg *blkcg)
+{
+	spin_lock_irq(&blkcg->lock);
+	while (!hlist_empty(&blkcg->blkg_list)) {
+		struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
+						    struct blkcg_gq,
+						    blkcg_node);
+		struct request_queue *q = blkg->q;
+
+		if (spin_trylock(q->queue_lock)) {
+			blkg_destroy(blkg);
+			spin_unlock(q->queue_lock);
+		} else {
+			spin_unlock_irq(&blkcg->lock);
+			cpu_relax();
+			spin_lock_irq(&blkcg->lock);
+		}
+	}
+	spin_unlock_irq(&blkcg->lock);
+}
+
 static void blkcg_css_free(struct cgroup_subsys_state *css)
 static void blkcg_css_free(struct cgroup_subsys_state *css)
 {
 {
 	struct blkcg *blkcg = css_to_blkcg(css);
 	struct blkcg *blkcg = css_to_blkcg(css);
 	int i;
 	int i;
 
 
+	blkcg_destroy_all_blkgs(blkcg);
+
 	mutex_lock(&blkcg_pol_mutex);
 	mutex_lock(&blkcg_pol_mutex);
 
 
 	list_del(&blkcg->all_blkcgs_node);
 	list_del(&blkcg->all_blkcgs_node);
@@ -1371,8 +1414,11 @@ void blkcg_deactivate_policy(struct request_queue *q,
 		spin_lock(&blkg->blkcg->lock);
 		spin_lock(&blkg->blkcg->lock);
 
 
 		if (blkg->pd[pol->plid]) {
 		if (blkg->pd[pol->plid]) {
-			if (pol->pd_offline_fn)
+			if (!blkg->pd[pol->plid]->offline &&
+			    pol->pd_offline_fn) {
 				pol->pd_offline_fn(blkg->pd[pol->plid]);
 				pol->pd_offline_fn(blkg->pd[pol->plid]);
+				blkg->pd[pol->plid]->offline = true;
+			}
 			pol->pd_free_fn(blkg->pd[pol->plid]);
 			pol->pd_free_fn(blkg->pd[pol->plid]);
 			blkg->pd[pol->plid] = NULL;
 			blkg->pd[pol->plid] = NULL;
 		}
 		}

+ 168 - 82
block/blk-core.c

@@ -71,6 +71,78 @@ struct kmem_cache *blk_requestq_cachep;
  */
  */
 static struct workqueue_struct *kblockd_workqueue;
 static struct workqueue_struct *kblockd_workqueue;
 
 
+/**
+ * blk_queue_flag_set - atomically set a queue flag
+ * @flag: flag to be set
+ * @q: request queue
+ */
+void blk_queue_flag_set(unsigned int flag, struct request_queue *q)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	queue_flag_set(flag, q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+EXPORT_SYMBOL(blk_queue_flag_set);
+
+/**
+ * blk_queue_flag_clear - atomically clear a queue flag
+ * @flag: flag to be cleared
+ * @q: request queue
+ */
+void blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	queue_flag_clear(flag, q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+EXPORT_SYMBOL(blk_queue_flag_clear);
+
+/**
+ * blk_queue_flag_test_and_set - atomically test and set a queue flag
+ * @flag: flag to be set
+ * @q: request queue
+ *
+ * Returns the previous value of @flag - 0 if the flag was not set and 1 if
+ * the flag was already set.
+ */
+bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q)
+{
+	unsigned long flags;
+	bool res;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	res = queue_flag_test_and_set(flag, q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	return res;
+}
+EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set);
+
+/**
+ * blk_queue_flag_test_and_clear - atomically test and clear a queue flag
+ * @flag: flag to be cleared
+ * @q: request queue
+ *
+ * Returns the previous value of @flag - 0 if the flag was not set and 1 if
+ * the flag was set.
+ */
+bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q)
+{
+	unsigned long flags;
+	bool res;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	res = queue_flag_test_and_clear(flag, q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	return res;
+}
+EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_clear);
+
 static void blk_clear_congested(struct request_list *rl, int sync)
 static void blk_clear_congested(struct request_list *rl, int sync)
 {
 {
 #ifdef CONFIG_CGROUP_WRITEBACK
 #ifdef CONFIG_CGROUP_WRITEBACK
@@ -361,25 +433,14 @@ EXPORT_SYMBOL(blk_sync_queue);
  */
  */
 int blk_set_preempt_only(struct request_queue *q)
 int blk_set_preempt_only(struct request_queue *q)
 {
 {
-	unsigned long flags;
-	int res;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	res = queue_flag_test_and_set(QUEUE_FLAG_PREEMPT_ONLY, q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
-
-	return res;
+	return blk_queue_flag_test_and_set(QUEUE_FLAG_PREEMPT_ONLY, q);
 }
 }
 EXPORT_SYMBOL_GPL(blk_set_preempt_only);
 EXPORT_SYMBOL_GPL(blk_set_preempt_only);
 
 
 void blk_clear_preempt_only(struct request_queue *q)
 void blk_clear_preempt_only(struct request_queue *q)
 {
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q);
+	blk_queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q);
 	wake_up_all(&q->mq_freeze_wq);
 	wake_up_all(&q->mq_freeze_wq);
-	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 }
 EXPORT_SYMBOL_GPL(blk_clear_preempt_only);
 EXPORT_SYMBOL_GPL(blk_clear_preempt_only);
 
 
@@ -629,9 +690,7 @@ EXPORT_SYMBOL_GPL(blk_queue_bypass_end);
 
 
 void blk_set_queue_dying(struct request_queue *q)
 void blk_set_queue_dying(struct request_queue *q)
 {
 {
-	spin_lock_irq(q->queue_lock);
-	queue_flag_set(QUEUE_FLAG_DYING, q);
-	spin_unlock_irq(q->queue_lock);
+	blk_queue_flag_set(QUEUE_FLAG_DYING, q);
 
 
 	/*
 	/*
 	 * When queue DYING flag is set, we need to block new req
 	 * When queue DYING flag is set, we need to block new req
@@ -719,6 +778,37 @@ void blk_cleanup_queue(struct request_queue *q)
 	del_timer_sync(&q->backing_dev_info->laptop_mode_wb_timer);
 	del_timer_sync(&q->backing_dev_info->laptop_mode_wb_timer);
 	blk_sync_queue(q);
 	blk_sync_queue(q);
 
 
+	/*
+	 * I/O scheduler exit is only safe after the sysfs scheduler attribute
+	 * has been removed.
+	 */
+	WARN_ON_ONCE(q->kobj.state_in_sysfs);
+
+	/*
+	 * Since the I/O scheduler exit code may access cgroup information,
+	 * perform I/O scheduler exit before disassociating from the block
+	 * cgroup controller.
+	 */
+	if (q->elevator) {
+		ioc_clear_queue(q);
+		elevator_exit(q, q->elevator);
+		q->elevator = NULL;
+	}
+
+	/*
+	 * Remove all references to @q from the block cgroup controller before
+	 * restoring @q->queue_lock to avoid that restoring this pointer causes
+	 * e.g. blkcg_print_blkgs() to crash.
+	 */
+	blkcg_exit_queue(q);
+
+	/*
+	 * Since the cgroup code may dereference the @q->backing_dev_info
+	 * pointer, only decrease its reference count after having removed the
+	 * association with the block cgroup controller.
+	 */
+	bdi_put(q->backing_dev_info);
+
 	if (q->mq_ops)
 	if (q->mq_ops)
 		blk_mq_free_queue(q);
 		blk_mq_free_queue(q);
 	percpu_ref_exit(&q->q_usage_counter);
 	percpu_ref_exit(&q->q_usage_counter);
@@ -810,7 +900,7 @@ void blk_exit_rl(struct request_queue *q, struct request_list *rl)
 
 
 struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
 struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
 {
 {
-	return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE);
+	return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE, NULL);
 }
 }
 EXPORT_SYMBOL(blk_alloc_queue);
 EXPORT_SYMBOL(blk_alloc_queue);
 
 
@@ -827,7 +917,7 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
 		bool success = false;
 		bool success = false;
 		int ret;
 		int ret;
 
 
-		rcu_read_lock_sched();
+		rcu_read_lock();
 		if (percpu_ref_tryget_live(&q->q_usage_counter)) {
 		if (percpu_ref_tryget_live(&q->q_usage_counter)) {
 			/*
 			/*
 			 * The code that sets the PREEMPT_ONLY flag is
 			 * The code that sets the PREEMPT_ONLY flag is
@@ -840,7 +930,7 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
 				percpu_ref_put(&q->q_usage_counter);
 				percpu_ref_put(&q->q_usage_counter);
 			}
 			}
 		}
 		}
-		rcu_read_unlock_sched();
+		rcu_read_unlock();
 
 
 		if (success)
 		if (success)
 			return 0;
 			return 0;
@@ -888,7 +978,21 @@ static void blk_rq_timed_out_timer(struct timer_list *t)
 	kblockd_schedule_work(&q->timeout_work);
 	kblockd_schedule_work(&q->timeout_work);
 }
 }
 
 
-struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
+/**
+ * blk_alloc_queue_node - allocate a request queue
+ * @gfp_mask: memory allocation flags
+ * @node_id: NUMA node to allocate memory from
+ * @lock: For legacy queues, pointer to a spinlock that will be used to e.g.
+ *        serialize calls to the legacy .request_fn() callback. Ignored for
+ *	  blk-mq request queues.
+ *
+ * Note: pass the queue lock as the third argument to this function instead of
+ * setting the queue lock pointer explicitly to avoid triggering a sporadic
+ * crash in the blkcg code. This function namely calls blkcg_init_queue() and
+ * the queue lock pointer must be set before blkcg_init_queue() is called.
+ */
+struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id,
+					   spinlock_t *lock)
 {
 {
 	struct request_queue *q;
 	struct request_queue *q;
 
 
@@ -939,11 +1043,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	mutex_init(&q->sysfs_lock);
 	mutex_init(&q->sysfs_lock);
 	spin_lock_init(&q->__queue_lock);
 	spin_lock_init(&q->__queue_lock);
 
 
-	/*
-	 * By default initialize queue_lock to internal lock and driver can
-	 * override it later if need be.
-	 */
-	q->queue_lock = &q->__queue_lock;
+	if (!q->mq_ops)
+		q->queue_lock = lock ? : &q->__queue_lock;
 
 
 	/*
 	/*
 	 * A queue starts its life with bypass turned on to avoid
 	 * A queue starts its life with bypass turned on to avoid
@@ -952,7 +1053,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	 * registered by blk_register_queue().
 	 * registered by blk_register_queue().
 	 */
 	 */
 	q->bypass_depth = 1;
 	q->bypass_depth = 1;
-	__set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
+	queue_flag_set_unlocked(QUEUE_FLAG_BYPASS, q);
 
 
 	init_waitqueue_head(&q->mq_freeze_wq);
 	init_waitqueue_head(&q->mq_freeze_wq);
 
 
@@ -1030,13 +1131,11 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
 {
 {
 	struct request_queue *q;
 	struct request_queue *q;
 
 
-	q = blk_alloc_queue_node(GFP_KERNEL, node_id);
+	q = blk_alloc_queue_node(GFP_KERNEL, node_id, lock);
 	if (!q)
 	if (!q)
 		return NULL;
 		return NULL;
 
 
 	q->request_fn = rfn;
 	q->request_fn = rfn;
-	if (lock)
-		q->queue_lock = lock;
 	if (blk_init_allocated_queue(q) < 0) {
 	if (blk_init_allocated_queue(q) < 0) {
 		blk_cleanup_queue(q);
 		blk_cleanup_queue(q);
 		return NULL;
 		return NULL;
@@ -2023,7 +2122,7 @@ out_unlock:
 	return BLK_QC_T_NONE;
 	return BLK_QC_T_NONE;
 }
 }
 
 
-static void handle_bad_sector(struct bio *bio)
+static void handle_bad_sector(struct bio *bio, sector_t maxsector)
 {
 {
 	char b[BDEVNAME_SIZE];
 	char b[BDEVNAME_SIZE];
 
 
@@ -2031,7 +2130,7 @@ static void handle_bad_sector(struct bio *bio)
 	printk(KERN_INFO "%s: rw=%d, want=%Lu, limit=%Lu\n",
 	printk(KERN_INFO "%s: rw=%d, want=%Lu, limit=%Lu\n",
 			bio_devname(bio, b), bio->bi_opf,
 			bio_devname(bio, b), bio->bi_opf,
 			(unsigned long long)bio_end_sector(bio),
 			(unsigned long long)bio_end_sector(bio),
-			(long long)get_capacity(bio->bi_disk));
+			(long long)maxsector);
 }
 }
 
 
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 #ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -2092,68 +2191,59 @@ static noinline int should_fail_bio(struct bio *bio)
 }
 }
 ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);
 ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);
 
 
+/*
+ * Check whether this bio extends beyond the end of the device or partition.
+ * This may well happen - the kernel calls bread() without checking the size of
+ * the device, e.g., when mounting a file system.
+ */
+static inline int bio_check_eod(struct bio *bio, sector_t maxsector)
+{
+	unsigned int nr_sectors = bio_sectors(bio);
+
+	if (nr_sectors && maxsector &&
+	    (nr_sectors > maxsector ||
+	     bio->bi_iter.bi_sector > maxsector - nr_sectors)) {
+		handle_bad_sector(bio, maxsector);
+		return -EIO;
+	}
+	return 0;
+}
+
 /*
 /*
  * Remap block n of partition p to block n+start(p) of the disk.
  * Remap block n of partition p to block n+start(p) of the disk.
  */
  */
 static inline int blk_partition_remap(struct bio *bio)
 static inline int blk_partition_remap(struct bio *bio)
 {
 {
 	struct hd_struct *p;
 	struct hd_struct *p;
-	int ret = 0;
+	int ret = -EIO;
 
 
 	rcu_read_lock();
 	rcu_read_lock();
 	p = __disk_get_part(bio->bi_disk, bio->bi_partno);
 	p = __disk_get_part(bio->bi_disk, bio->bi_partno);
-	if (unlikely(!p || should_fail_request(p, bio->bi_iter.bi_size) ||
-		     bio_check_ro(bio, p))) {
-		ret = -EIO;
+	if (unlikely(!p))
+		goto out;
+	if (unlikely(should_fail_request(p, bio->bi_iter.bi_size)))
+		goto out;
+	if (unlikely(bio_check_ro(bio, p)))
 		goto out;
 		goto out;
-	}
 
 
 	/*
 	/*
 	 * Zone reset does not include bi_size so bio_sectors() is always 0.
 	 * Zone reset does not include bi_size so bio_sectors() is always 0.
 	 * Include a test for the reset op code and perform the remap if needed.
 	 * Include a test for the reset op code and perform the remap if needed.
 	 */
 	 */
-	if (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET)
-		goto out;
-
-	bio->bi_iter.bi_sector += p->start_sect;
-	bio->bi_partno = 0;
-	trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
-			      bio->bi_iter.bi_sector - p->start_sect);
-
+	if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET) {
+		if (bio_check_eod(bio, part_nr_sects_read(p)))
+			goto out;
+		bio->bi_iter.bi_sector += p->start_sect;
+		bio->bi_partno = 0;
+		trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
+				      bio->bi_iter.bi_sector - p->start_sect);
+	}
+	ret = 0;
 out:
 out:
 	rcu_read_unlock();
 	rcu_read_unlock();
 	return ret;
 	return ret;
 }
 }
 
 
-/*
- * Check whether this bio extends beyond the end of the device.
- */
-static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
-{
-	sector_t maxsector;
-
-	if (!nr_sectors)
-		return 0;
-
-	/* Test device or partition size, when known. */
-	maxsector = get_capacity(bio->bi_disk);
-	if (maxsector) {
-		sector_t sector = bio->bi_iter.bi_sector;
-
-		if (maxsector < nr_sectors || maxsector - nr_sectors < sector) {
-			/*
-			 * This may well happen - the kernel calls bread()
-			 * without checking the size of the device, e.g., when
-			 * mounting a device.
-			 */
-			handle_bad_sector(bio);
-			return 1;
-		}
-	}
-
-	return 0;
-}
-
 static noinline_for_stack bool
 static noinline_for_stack bool
 generic_make_request_checks(struct bio *bio)
 generic_make_request_checks(struct bio *bio)
 {
 {
@@ -2164,9 +2254,6 @@ generic_make_request_checks(struct bio *bio)
 
 
 	might_sleep();
 	might_sleep();
 
 
-	if (bio_check_eod(bio, nr_sectors))
-		goto end_io;
-
 	q = bio->bi_disk->queue;
 	q = bio->bi_disk->queue;
 	if (unlikely(!q)) {
 	if (unlikely(!q)) {
 		printk(KERN_ERR
 		printk(KERN_ERR
@@ -2186,17 +2273,16 @@ generic_make_request_checks(struct bio *bio)
 	if (should_fail_bio(bio))
 	if (should_fail_bio(bio))
 		goto end_io;
 		goto end_io;
 
 
-	if (!bio->bi_partno) {
-		if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0)))
+	if (bio->bi_partno) {
+		if (unlikely(blk_partition_remap(bio)))
 			goto end_io;
 			goto end_io;
 	} else {
 	} else {
-		if (blk_partition_remap(bio))
+		if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0)))
+			goto end_io;
+		if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk))))
 			goto end_io;
 			goto end_io;
 	}
 	}
 
 
-	if (bio_check_eod(bio, nr_sectors))
-		goto end_io;
-
 	/*
 	/*
 	 * Filter flush bio's early so that make_request based
 	 * Filter flush bio's early so that make_request based
 	 * drivers without flush support don't have to worry
 	 * drivers without flush support don't have to worry

+ 90 - 60
block/blk-mq-debugfs.c

@@ -24,6 +24,64 @@
 #include "blk-mq-debugfs.h"
 #include "blk-mq-debugfs.h"
 #include "blk-mq-tag.h"
 #include "blk-mq-tag.h"
 
 
+static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
+{
+	if (stat->nr_samples) {
+		seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu",
+			   stat->nr_samples, stat->mean, stat->min, stat->max);
+	} else {
+		seq_puts(m, "samples=0");
+	}
+}
+
+static int queue_poll_stat_show(void *data, struct seq_file *m)
+{
+	struct request_queue *q = data;
+	int bucket;
+
+	for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS/2; bucket++) {
+		seq_printf(m, "read  (%d Bytes): ", 1 << (9+bucket));
+		print_stat(m, &q->poll_stat[2*bucket]);
+		seq_puts(m, "\n");
+
+		seq_printf(m, "write (%d Bytes): ",  1 << (9+bucket));
+		print_stat(m, &q->poll_stat[2*bucket+1]);
+		seq_puts(m, "\n");
+	}
+	return 0;
+}
+
+static void *queue_requeue_list_start(struct seq_file *m, loff_t *pos)
+	__acquires(&q->requeue_lock)
+{
+	struct request_queue *q = m->private;
+
+	spin_lock_irq(&q->requeue_lock);
+	return seq_list_start(&q->requeue_list, *pos);
+}
+
+static void *queue_requeue_list_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct request_queue *q = m->private;
+
+	return seq_list_next(v, &q->requeue_list, pos);
+}
+
+static void queue_requeue_list_stop(struct seq_file *m, void *v)
+	__releases(&q->requeue_lock)
+{
+	struct request_queue *q = m->private;
+
+	spin_unlock_irq(&q->requeue_lock);
+}
+
+static const struct seq_operations queue_requeue_list_seq_ops = {
+	.start	= queue_requeue_list_start,
+	.next	= queue_requeue_list_next,
+	.stop	= queue_requeue_list_stop,
+	.show	= blk_mq_debugfs_rq_show,
+};
+
 static int blk_flags_show(struct seq_file *m, const unsigned long flags,
 static int blk_flags_show(struct seq_file *m, const unsigned long flags,
 			  const char *const *flag_name, int flag_name_count)
 			  const char *const *flag_name, int flag_name_count)
 {
 {
@@ -125,16 +183,6 @@ inval:
 	return count;
 	return count;
 }
 }
 
 
-static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
-{
-	if (stat->nr_samples) {
-		seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu",
-			   stat->nr_samples, stat->mean, stat->min, stat->max);
-	} else {
-		seq_puts(m, "samples=0");
-	}
-}
-
 static int queue_write_hint_show(void *data, struct seq_file *m)
 static int queue_write_hint_show(void *data, struct seq_file *m)
 {
 {
 	struct request_queue *q = data;
 	struct request_queue *q = data;
@@ -158,23 +206,30 @@ static ssize_t queue_write_hint_store(void *data, const char __user *buf,
 	return count;
 	return count;
 }
 }
 
 
-static int queue_poll_stat_show(void *data, struct seq_file *m)
+static int queue_zone_wlock_show(void *data, struct seq_file *m)
 {
 {
 	struct request_queue *q = data;
 	struct request_queue *q = data;
-	int bucket;
+	unsigned int i;
 
 
-	for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS/2; bucket++) {
-		seq_printf(m, "read  (%d Bytes): ", 1 << (9+bucket));
-		print_stat(m, &q->poll_stat[2*bucket]);
-		seq_puts(m, "\n");
+	if (!q->seq_zones_wlock)
+		return 0;
+
+	for (i = 0; i < blk_queue_nr_zones(q); i++)
+		if (test_bit(i, q->seq_zones_wlock))
+			seq_printf(m, "%u\n", i);
 
 
-		seq_printf(m, "write (%d Bytes): ",  1 << (9+bucket));
-		print_stat(m, &q->poll_stat[2*bucket+1]);
-		seq_puts(m, "\n");
-	}
 	return 0;
 	return 0;
 }
 }
 
 
+static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
+	{ "poll_stat", 0400, queue_poll_stat_show },
+	{ "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops },
+	{ "state", 0600, queue_state_show, queue_state_write },
+	{ "write_hints", 0600, queue_write_hint_show, queue_write_hint_store },
+	{ "zone_wlock", 0400, queue_zone_wlock_show, NULL },
+	{ },
+};
+
 #define HCTX_STATE_NAME(name) [BLK_MQ_S_##name] = #name
 #define HCTX_STATE_NAME(name) [BLK_MQ_S_##name] = #name
 static const char *const hctx_state_name[] = {
 static const char *const hctx_state_name[] = {
 	HCTX_STATE_NAME(STOPPED),
 	HCTX_STATE_NAME(STOPPED),
@@ -295,6 +350,20 @@ static const char *const rqf_name[] = {
 };
 };
 #undef RQF_NAME
 #undef RQF_NAME
 
 
+static const char *const blk_mq_rq_state_name_array[] = {
+	[MQ_RQ_IDLE]		= "idle",
+	[MQ_RQ_IN_FLIGHT]	= "in_flight",
+	[MQ_RQ_COMPLETE]	= "complete",
+};
+
+static const char *blk_mq_rq_state_name(enum mq_rq_state rq_state)
+{
+	if (WARN_ON_ONCE((unsigned int)rq_state >
+			 ARRAY_SIZE(blk_mq_rq_state_name_array)))
+		return "(?)";
+	return blk_mq_rq_state_name_array[rq_state];
+}
+
 int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
 int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
 {
 {
 	const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
 	const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
@@ -311,7 +380,7 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
 	seq_puts(m, ", .rq_flags=");
 	seq_puts(m, ", .rq_flags=");
 	blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name,
 	blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name,
 		       ARRAY_SIZE(rqf_name));
 		       ARRAY_SIZE(rqf_name));
-	seq_printf(m, ", complete=%d", blk_rq_is_complete(rq));
+	seq_printf(m, ", .state=%s", blk_mq_rq_state_name(blk_mq_rq_state(rq)));
 	seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag,
 	seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag,
 		   rq->internal_tag);
 		   rq->internal_tag);
 	if (mq_ops->show_rq)
 	if (mq_ops->show_rq)
@@ -327,37 +396,6 @@ int blk_mq_debugfs_rq_show(struct seq_file *m, void *v)
 }
 }
 EXPORT_SYMBOL_GPL(blk_mq_debugfs_rq_show);
 EXPORT_SYMBOL_GPL(blk_mq_debugfs_rq_show);
 
 
-static void *queue_requeue_list_start(struct seq_file *m, loff_t *pos)
-	__acquires(&q->requeue_lock)
-{
-	struct request_queue *q = m->private;
-
-	spin_lock_irq(&q->requeue_lock);
-	return seq_list_start(&q->requeue_list, *pos);
-}
-
-static void *queue_requeue_list_next(struct seq_file *m, void *v, loff_t *pos)
-{
-	struct request_queue *q = m->private;
-
-	return seq_list_next(v, &q->requeue_list, pos);
-}
-
-static void queue_requeue_list_stop(struct seq_file *m, void *v)
-	__releases(&q->requeue_lock)
-{
-	struct request_queue *q = m->private;
-
-	spin_unlock_irq(&q->requeue_lock);
-}
-
-static const struct seq_operations queue_requeue_list_seq_ops = {
-	.start	= queue_requeue_list_start,
-	.next	= queue_requeue_list_next,
-	.stop	= queue_requeue_list_stop,
-	.show	= blk_mq_debugfs_rq_show,
-};
-
 static void *hctx_dispatch_start(struct seq_file *m, loff_t *pos)
 static void *hctx_dispatch_start(struct seq_file *m, loff_t *pos)
 	__acquires(&hctx->lock)
 	__acquires(&hctx->lock)
 {
 {
@@ -747,14 +785,6 @@ static const struct file_operations blk_mq_debugfs_fops = {
 	.release	= blk_mq_debugfs_release,
 	.release	= blk_mq_debugfs_release,
 };
 };
 
 
-static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
-	{"poll_stat", 0400, queue_poll_stat_show},
-	{"requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops},
-	{"state", 0600, queue_state_show, queue_state_write},
-	{"write_hints", 0600, queue_write_hint_show, queue_write_hint_store},
-	{},
-};
-
 static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
 static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
 	{"state", 0400, hctx_state_show},
 	{"state", 0400, hctx_state_show},
 	{"flags", 0400, hctx_flags_show},
 	{"flags", 0400, hctx_flags_show},

+ 4 - 2
block/blk-mq-pci.c

@@ -21,6 +21,7 @@
  * blk_mq_pci_map_queues - provide a default queue mapping for PCI device
  * blk_mq_pci_map_queues - provide a default queue mapping for PCI device
  * @set:	tagset to provide the mapping for
  * @set:	tagset to provide the mapping for
  * @pdev:	PCI device associated with @set.
  * @pdev:	PCI device associated with @set.
+ * @offset:	Offset to use for the pci irq vector
  *
  *
  * This function assumes the PCI device @pdev has at least as many available
  * This function assumes the PCI device @pdev has at least as many available
  * interrupt vectors as @set has queues.  It will then query the vector
  * interrupt vectors as @set has queues.  It will then query the vector
@@ -28,13 +29,14 @@
  * that maps a queue to the CPUs that have irq affinity for the corresponding
  * that maps a queue to the CPUs that have irq affinity for the corresponding
  * vector.
  * vector.
  */
  */
-int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev)
+int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev,
+			    int offset)
 {
 {
 	const struct cpumask *mask;
 	const struct cpumask *mask;
 	unsigned int queue, cpu;
 	unsigned int queue, cpu;
 
 
 	for (queue = 0; queue < set->nr_hw_queues; queue++) {
 	for (queue = 0; queue < set->nr_hw_queues; queue++) {
-		mask = pci_irq_get_affinity(pdev, queue);
+		mask = pci_irq_get_affinity(pdev, queue + offset);
 		if (!mask)
 		if (!mask)
 			goto fallback;
 			goto fallback;
 
 

+ 6 - 14
block/blk-mq.c

@@ -194,11 +194,7 @@ EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
  */
  */
 void blk_mq_quiesce_queue_nowait(struct request_queue *q)
 void blk_mq_quiesce_queue_nowait(struct request_queue *q)
 {
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	queue_flag_set(QUEUE_FLAG_QUIESCED, q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
+	blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
 }
 }
 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
 
 
@@ -239,11 +235,7 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
  */
  */
 void blk_mq_unquiesce_queue(struct request_queue *q)
 void blk_mq_unquiesce_queue(struct request_queue *q)
 {
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
+	blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
 
 
 	/* dispatch requests which are inserted during quiescing */
 	/* dispatch requests which are inserted during quiescing */
 	blk_mq_run_hw_queues(q, true);
 	blk_mq_run_hw_queues(q, true);
@@ -986,9 +978,9 @@ static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
 	struct blk_mq_hw_ctx *hctx = flush_data->hctx;
 	struct blk_mq_hw_ctx *hctx = flush_data->hctx;
 	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
 	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
 
 
-	sbitmap_clear_bit(sb, bitnr);
 	spin_lock(&ctx->lock);
 	spin_lock(&ctx->lock);
 	list_splice_tail_init(&ctx->rq_list, flush_data->list);
 	list_splice_tail_init(&ctx->rq_list, flush_data->list);
+	sbitmap_clear_bit(sb, bitnr);
 	spin_unlock(&ctx->lock);
 	spin_unlock(&ctx->lock);
 	return true;
 	return true;
 }
 }
@@ -2556,7 +2548,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 {
 {
 	struct request_queue *uninit_q, *q;
 	struct request_queue *uninit_q, *q;
 
 
-	uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
+	uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node, NULL);
 	if (!uninit_q)
 	if (!uninit_q)
 		return ERR_PTR(-ENOMEM);
 		return ERR_PTR(-ENOMEM);
 
 
@@ -2678,7 +2670,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
 	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
 
 
 	if (!(set->flags & BLK_MQ_F_SG_MERGE))
 	if (!(set->flags & BLK_MQ_F_SG_MERGE))
-		q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE;
+		queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
 
 
 	q->sg_reserved_size = INT_MAX;
 	q->sg_reserved_size = INT_MAX;
 
 
@@ -3005,7 +2997,7 @@ EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
 static bool blk_poll_stats_enable(struct request_queue *q)
 static bool blk_poll_stats_enable(struct request_queue *q)
 {
 {
 	if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
 	if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
-	    test_and_set_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags))
+	    blk_queue_flag_test_and_set(QUEUE_FLAG_POLL_STATS, q))
 		return true;
 		return true;
 	blk_stat_add_callback(q, q->poll_cb);
 	blk_stat_add_callback(q, q->poll_cb);
 	return false;
 	return false;

+ 2 - 4
block/blk-settings.c

@@ -859,12 +859,10 @@ EXPORT_SYMBOL(blk_queue_update_dma_alignment);
 
 
 void blk_queue_flush_queueable(struct request_queue *q, bool queueable)
 void blk_queue_flush_queueable(struct request_queue *q, bool queueable)
 {
 {
-	spin_lock_irq(q->queue_lock);
 	if (queueable)
 	if (queueable)
-		clear_bit(QUEUE_FLAG_FLUSH_NQ, &q->queue_flags);
+		blk_queue_flag_clear(QUEUE_FLAG_FLUSH_NQ, q);
 	else
 	else
-		set_bit(QUEUE_FLAG_FLUSH_NQ, &q->queue_flags);
-	spin_unlock_irq(q->queue_lock);
+		blk_queue_flag_set(QUEUE_FLAG_FLUSH_NQ, q);
 }
 }
 EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
 EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
 
 

+ 3 - 3
block/blk-stat.c

@@ -152,7 +152,7 @@ void blk_stat_add_callback(struct request_queue *q,
 
 
 	spin_lock(&q->stats->lock);
 	spin_lock(&q->stats->lock);
 	list_add_tail_rcu(&cb->list, &q->stats->callbacks);
 	list_add_tail_rcu(&cb->list, &q->stats->callbacks);
-	set_bit(QUEUE_FLAG_STATS, &q->queue_flags);
+	blk_queue_flag_set(QUEUE_FLAG_STATS, q);
 	spin_unlock(&q->stats->lock);
 	spin_unlock(&q->stats->lock);
 }
 }
 EXPORT_SYMBOL_GPL(blk_stat_add_callback);
 EXPORT_SYMBOL_GPL(blk_stat_add_callback);
@@ -163,7 +163,7 @@ void blk_stat_remove_callback(struct request_queue *q,
 	spin_lock(&q->stats->lock);
 	spin_lock(&q->stats->lock);
 	list_del_rcu(&cb->list);
 	list_del_rcu(&cb->list);
 	if (list_empty(&q->stats->callbacks) && !q->stats->enable_accounting)
 	if (list_empty(&q->stats->callbacks) && !q->stats->enable_accounting)
-		clear_bit(QUEUE_FLAG_STATS, &q->queue_flags);
+		blk_queue_flag_clear(QUEUE_FLAG_STATS, q);
 	spin_unlock(&q->stats->lock);
 	spin_unlock(&q->stats->lock);
 
 
 	del_timer_sync(&cb->timer);
 	del_timer_sync(&cb->timer);
@@ -191,7 +191,7 @@ void blk_stat_enable_accounting(struct request_queue *q)
 {
 {
 	spin_lock(&q->stats->lock);
 	spin_lock(&q->stats->lock);
 	q->stats->enable_accounting = true;
 	q->stats->enable_accounting = true;
-	set_bit(QUEUE_FLAG_STATS, &q->queue_flags);
+	blk_queue_flag_set(QUEUE_FLAG_STATS, q);
 	spin_unlock(&q->stats->lock);
 	spin_unlock(&q->stats->lock);
 }
 }
 
 

+ 7 - 22
block/blk-sysfs.c

@@ -276,12 +276,10 @@ queue_store_##name(struct request_queue *q, const char *page, size_t count) \
 	if (neg)							\
 	if (neg)							\
 		val = !val;						\
 		val = !val;						\
 									\
 									\
-	spin_lock_irq(q->queue_lock);					\
 	if (val)							\
 	if (val)							\
-		queue_flag_set(QUEUE_FLAG_##flag, q);			\
+		blk_queue_flag_set(QUEUE_FLAG_##flag, q);		\
 	else								\
 	else								\
-		queue_flag_clear(QUEUE_FLAG_##flag, q);			\
-	spin_unlock_irq(q->queue_lock);					\
+		blk_queue_flag_clear(QUEUE_FLAG_##flag, q);		\
 	return ret;							\
 	return ret;							\
 }
 }
 
 
@@ -414,12 +412,10 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page,
 	if (ret < 0)
 	if (ret < 0)
 		return ret;
 		return ret;
 
 
-	spin_lock_irq(q->queue_lock);
 	if (poll_on)
 	if (poll_on)
-		queue_flag_set(QUEUE_FLAG_POLL, q);
+		blk_queue_flag_set(QUEUE_FLAG_POLL, q);
 	else
 	else
-		queue_flag_clear(QUEUE_FLAG_POLL, q);
-	spin_unlock_irq(q->queue_lock);
+		blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
 
 
 	return ret;
 	return ret;
 }
 }
@@ -487,12 +483,10 @@ static ssize_t queue_wc_store(struct request_queue *q, const char *page,
 	if (set == -1)
 	if (set == -1)
 		return -EINVAL;
 		return -EINVAL;
 
 
-	spin_lock_irq(q->queue_lock);
 	if (set)
 	if (set)
-		queue_flag_set(QUEUE_FLAG_WC, q);
+		blk_queue_flag_set(QUEUE_FLAG_WC, q);
 	else
 	else
-		queue_flag_clear(QUEUE_FLAG_WC, q);
-	spin_unlock_irq(q->queue_lock);
+		blk_queue_flag_clear(QUEUE_FLAG_WC, q);
 
 
 	return count;
 	return count;
 }
 }
@@ -798,13 +792,6 @@ static void __blk_release_queue(struct work_struct *work)
 	if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags))
 	if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags))
 		blk_stat_remove_callback(q, q->poll_cb);
 		blk_stat_remove_callback(q, q->poll_cb);
 	blk_stat_free_callback(q->poll_cb);
 	blk_stat_free_callback(q->poll_cb);
-	bdi_put(q->backing_dev_info);
-	blkcg_exit_queue(q);
-
-	if (q->elevator) {
-		ioc_clear_queue(q);
-		elevator_exit(q, q->elevator);
-	}
 
 
 	blk_free_queue_stats(q->stats);
 	blk_free_queue_stats(q->stats);
 
 
@@ -953,9 +940,7 @@ void blk_unregister_queue(struct gendisk *disk)
 	 */
 	 */
 	mutex_lock(&q->sysfs_lock);
 	mutex_lock(&q->sysfs_lock);
 
 
-	spin_lock_irq(q->queue_lock);
-	queue_flag_clear(QUEUE_FLAG_REGISTERED, q);
-	spin_unlock_irq(q->queue_lock);
+	blk_queue_flag_clear(QUEUE_FLAG_REGISTERED, q);
 
 
 	/*
 	/*
 	 * Remove the sysfs attributes before unregistering the queue data
 	 * Remove the sysfs attributes before unregistering the queue data

+ 3 - 5
block/blk-timeout.c

@@ -57,12 +57,10 @@ ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr,
 		char *p = (char *) buf;
 		char *p = (char *) buf;
 
 
 		val = simple_strtoul(p, &p, 10);
 		val = simple_strtoul(p, &p, 10);
-		spin_lock_irq(q->queue_lock);
 		if (val)
 		if (val)
-			queue_flag_set(QUEUE_FLAG_FAIL_IO, q);
+			blk_queue_flag_set(QUEUE_FLAG_FAIL_IO, q);
 		else
 		else
-			queue_flag_clear(QUEUE_FLAG_FAIL_IO, q);
-		spin_unlock_irq(q->queue_lock);
+			blk_queue_flag_clear(QUEUE_FLAG_FAIL_IO, q);
 	}
 	}
 
 
 	return count;
 	return count;
@@ -165,7 +163,7 @@ void blk_abort_request(struct request *req)
 		 * No need for fancy synchronizations.
 		 * No need for fancy synchronizations.
 		 */
 		 */
 		blk_rq_set_deadline(req, jiffies);
 		blk_rq_set_deadline(req, jiffies);
-		mod_timer(&req->q->timeout, 0);
+		kblockd_schedule_work(&req->q->timeout_work);
 	} else {
 	} else {
 		if (blk_mark_rq_complete(req))
 		if (blk_mark_rq_complete(req))
 			return;
 			return;

+ 2 - 2
block/blk-zoned.c

@@ -296,7 +296,7 @@ int blkdev_reset_zones(struct block_device *bdev,
 }
 }
 EXPORT_SYMBOL_GPL(blkdev_reset_zones);
 EXPORT_SYMBOL_GPL(blkdev_reset_zones);
 
 
-/**
+/*
  * BLKREPORTZONE ioctl processing.
  * BLKREPORTZONE ioctl processing.
  * Called from blkdev_ioctl.
  * Called from blkdev_ioctl.
  */
  */
@@ -355,7 +355,7 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
 	return ret;
 	return ret;
 }
 }
 
 
-/**
+/*
  * BLKRESETZONE ioctl processing.
  * BLKRESETZONE ioctl processing.
  * Called from blkdev_ioctl.
  * Called from blkdev_ioctl.
  */
  */

+ 69 - 0
block/blk.h

@@ -41,6 +41,75 @@ extern struct kmem_cache *request_cachep;
 extern struct kobj_type blk_queue_ktype;
 extern struct kobj_type blk_queue_ktype;
 extern struct ida blk_queue_ida;
 extern struct ida blk_queue_ida;
 
 
+/*
+ * @q->queue_lock is set while a queue is being initialized. Since we know
+ * that no other threads access the queue object before @q->queue_lock has
+ * been set, it is safe to manipulate queue flags without holding the
+ * queue_lock if @q->queue_lock == NULL. See also blk_alloc_queue_node() and
+ * blk_init_allocated_queue().
+ */
+static inline void queue_lockdep_assert_held(struct request_queue *q)
+{
+	if (q->queue_lock)
+		lockdep_assert_held(q->queue_lock);
+}
+
+static inline void queue_flag_set_unlocked(unsigned int flag,
+					   struct request_queue *q)
+{
+	if (test_bit(QUEUE_FLAG_INIT_DONE, &q->queue_flags) &&
+	    kref_read(&q->kobj.kref))
+		lockdep_assert_held(q->queue_lock);
+	__set_bit(flag, &q->queue_flags);
+}
+
+static inline void queue_flag_clear_unlocked(unsigned int flag,
+					     struct request_queue *q)
+{
+	if (test_bit(QUEUE_FLAG_INIT_DONE, &q->queue_flags) &&
+	    kref_read(&q->kobj.kref))
+		lockdep_assert_held(q->queue_lock);
+	__clear_bit(flag, &q->queue_flags);
+}
+
+static inline int queue_flag_test_and_clear(unsigned int flag,
+					    struct request_queue *q)
+{
+	queue_lockdep_assert_held(q);
+
+	if (test_bit(flag, &q->queue_flags)) {
+		__clear_bit(flag, &q->queue_flags);
+		return 1;
+	}
+
+	return 0;
+}
+
+static inline int queue_flag_test_and_set(unsigned int flag,
+					  struct request_queue *q)
+{
+	queue_lockdep_assert_held(q);
+
+	if (!test_bit(flag, &q->queue_flags)) {
+		__set_bit(flag, &q->queue_flags);
+		return 0;
+	}
+
+	return 1;
+}
+
+static inline void queue_flag_set(unsigned int flag, struct request_queue *q)
+{
+	queue_lockdep_assert_held(q);
+	__set_bit(flag, &q->queue_flags);
+}
+
+static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
+{
+	queue_lockdep_assert_held(q);
+	__clear_bit(flag, &q->queue_flags);
+}
+
 static inline struct blk_flush_queue *blk_get_flush_queue(
 static inline struct blk_flush_queue *blk_get_flush_queue(
 		struct request_queue *q, struct blk_mq_ctx *ctx)
 		struct request_queue *q, struct blk_mq_ctx *ctx)
 {
 {

+ 108 - 57
block/bsg-lib.c

@@ -27,6 +27,94 @@
 #include <linux/bsg-lib.h>
 #include <linux/bsg-lib.h>
 #include <linux/export.h>
 #include <linux/export.h>
 #include <scsi/scsi_cmnd.h>
 #include <scsi/scsi_cmnd.h>
+#include <scsi/sg.h>
+
+#define uptr64(val) ((void __user *)(uintptr_t)(val))
+
+static int bsg_transport_check_proto(struct sg_io_v4 *hdr)
+{
+	if (hdr->protocol != BSG_PROTOCOL_SCSI  ||
+	    hdr->subprotocol != BSG_SUB_PROTOCOL_SCSI_TRANSPORT)
+		return -EINVAL;
+	if (!capable(CAP_SYS_RAWIO))
+		return -EPERM;
+	return 0;
+}
+
+static int bsg_transport_fill_hdr(struct request *rq, struct sg_io_v4 *hdr,
+		fmode_t mode)
+{
+	struct bsg_job *job = blk_mq_rq_to_pdu(rq);
+
+	job->request_len = hdr->request_len;
+	job->request = memdup_user(uptr64(hdr->request), hdr->request_len);
+	if (IS_ERR(job->request))
+		return PTR_ERR(job->request);
+	return 0;
+}
+
+static int bsg_transport_complete_rq(struct request *rq, struct sg_io_v4 *hdr)
+{
+	struct bsg_job *job = blk_mq_rq_to_pdu(rq);
+	int ret = 0;
+
+	/*
+	 * The assignments below don't make much sense, but are kept for
+	 * bug by bug backwards compatibility:
+	 */
+	hdr->device_status = job->result & 0xff;
+	hdr->transport_status = host_byte(job->result);
+	hdr->driver_status = driver_byte(job->result);
+	hdr->info = 0;
+	if (hdr->device_status || hdr->transport_status || hdr->driver_status)
+		hdr->info |= SG_INFO_CHECK;
+	hdr->response_len = 0;
+
+	if (job->result < 0) {
+		/* we're only returning the result field in the reply */
+		job->reply_len = sizeof(u32);
+		ret = job->result;
+	}
+
+	if (job->reply_len && hdr->response) {
+		int len = min(hdr->max_response_len, job->reply_len);
+
+		if (copy_to_user(uptr64(hdr->response), job->reply, len))
+			ret = -EFAULT;
+		else
+			hdr->response_len = len;
+	}
+
+	/* we assume all request payload was transferred, residual == 0 */
+	hdr->dout_resid = 0;
+
+	if (rq->next_rq) {
+		unsigned int rsp_len = job->reply_payload.payload_len;
+
+		if (WARN_ON(job->reply_payload_rcv_len > rsp_len))
+			hdr->din_resid = 0;
+		else
+			hdr->din_resid = rsp_len - job->reply_payload_rcv_len;
+	} else {
+		hdr->din_resid = 0;
+	}
+
+	return ret;
+}
+
+static void bsg_transport_free_rq(struct request *rq)
+{
+	struct bsg_job *job = blk_mq_rq_to_pdu(rq);
+
+	kfree(job->request);
+}
+
+static const struct bsg_ops bsg_transport_ops = {
+	.check_proto		= bsg_transport_check_proto,
+	.fill_hdr		= bsg_transport_fill_hdr,
+	.complete_rq		= bsg_transport_complete_rq,
+	.free_rq		= bsg_transport_free_rq,
+};
 
 
 /**
 /**
  * bsg_teardown_job - routine to teardown a bsg job
  * bsg_teardown_job - routine to teardown a bsg job
@@ -35,7 +123,7 @@
 static void bsg_teardown_job(struct kref *kref)
 static void bsg_teardown_job(struct kref *kref)
 {
 {
 	struct bsg_job *job = container_of(kref, struct bsg_job, kref);
 	struct bsg_job *job = container_of(kref, struct bsg_job, kref);
-	struct request *rq = job->req;
+	struct request *rq = blk_mq_rq_from_pdu(job);
 
 
 	put_device(job->dev);	/* release reference for the request */
 	put_device(job->dev);	/* release reference for the request */
 
 
@@ -68,28 +156,9 @@ EXPORT_SYMBOL_GPL(bsg_job_get);
 void bsg_job_done(struct bsg_job *job, int result,
 void bsg_job_done(struct bsg_job *job, int result,
 		  unsigned int reply_payload_rcv_len)
 		  unsigned int reply_payload_rcv_len)
 {
 {
-	struct request *req = job->req;
-	struct request *rsp = req->next_rq;
-	struct scsi_request *rq = scsi_req(req);
-	int err;
-
-	err = scsi_req(job->req)->result = result;
-	if (err < 0)
-		/* we're only returning the result field in the reply */
-		rq->sense_len = sizeof(u32);
-	else
-		rq->sense_len = job->reply_len;
-	/* we assume all request payload was transferred, residual == 0 */
-	rq->resid_len = 0;
-
-	if (rsp) {
-		WARN_ON(reply_payload_rcv_len > scsi_req(rsp)->resid_len);
-
-		/* set reply (bidi) residual */
-		scsi_req(rsp)->resid_len -=
-			min(reply_payload_rcv_len, scsi_req(rsp)->resid_len);
-	}
-	blk_complete_request(req);
+	job->result = result;
+	job->reply_payload_rcv_len = reply_payload_rcv_len;
+	blk_complete_request(blk_mq_rq_from_pdu(job));
 }
 }
 EXPORT_SYMBOL_GPL(bsg_job_done);
 EXPORT_SYMBOL_GPL(bsg_job_done);
 
 
@@ -114,7 +183,6 @@ static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req)
 	if (!buf->sg_list)
 	if (!buf->sg_list)
 		return -ENOMEM;
 		return -ENOMEM;
 	sg_init_table(buf->sg_list, req->nr_phys_segments);
 	sg_init_table(buf->sg_list, req->nr_phys_segments);
-	scsi_req(req)->resid_len = blk_rq_bytes(req);
 	buf->sg_cnt = blk_rq_map_sg(req->q, req, buf->sg_list);
 	buf->sg_cnt = blk_rq_map_sg(req->q, req, buf->sg_list);
 	buf->payload_len = blk_rq_bytes(req);
 	buf->payload_len = blk_rq_bytes(req);
 	return 0;
 	return 0;
@@ -125,15 +193,13 @@ static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req)
  * @dev: device that is being sent the bsg request
  * @dev: device that is being sent the bsg request
  * @req: BSG request that needs a job structure
  * @req: BSG request that needs a job structure
  */
  */
-static int bsg_prepare_job(struct device *dev, struct request *req)
+static bool bsg_prepare_job(struct device *dev, struct request *req)
 {
 {
 	struct request *rsp = req->next_rq;
 	struct request *rsp = req->next_rq;
-	struct scsi_request *rq = scsi_req(req);
 	struct bsg_job *job = blk_mq_rq_to_pdu(req);
 	struct bsg_job *job = blk_mq_rq_to_pdu(req);
 	int ret;
 	int ret;
 
 
-	job->request = rq->cmd;
-	job->request_len = rq->cmd_len;
+	job->timeout = req->timeout;
 
 
 	if (req->bio) {
 	if (req->bio) {
 		ret = bsg_map_buffer(&job->request_payload, req);
 		ret = bsg_map_buffer(&job->request_payload, req);
@@ -149,12 +215,13 @@ static int bsg_prepare_job(struct device *dev, struct request *req)
 	/* take a reference for the request */
 	/* take a reference for the request */
 	get_device(job->dev);
 	get_device(job->dev);
 	kref_init(&job->kref);
 	kref_init(&job->kref);
-	return 0;
+	return true;
 
 
 failjob_rls_rqst_payload:
 failjob_rls_rqst_payload:
 	kfree(job->request_payload.sg_list);
 	kfree(job->request_payload.sg_list);
 failjob_rls_job:
 failjob_rls_job:
-	return -ENOMEM;
+	job->result = -ENOMEM;
+	return false;
 }
 }
 
 
 /**
 /**
@@ -183,9 +250,7 @@ static void bsg_request_fn(struct request_queue *q)
 			break;
 			break;
 		spin_unlock_irq(q->queue_lock);
 		spin_unlock_irq(q->queue_lock);
 
 
-		ret = bsg_prepare_job(dev, req);
-		if (ret) {
-			scsi_req(req)->result = ret;
+		if (!bsg_prepare_job(dev, req)) {
 			blk_end_request_all(req, BLK_STS_OK);
 			blk_end_request_all(req, BLK_STS_OK);
 			spin_lock_irq(q->queue_lock);
 			spin_lock_irq(q->queue_lock);
 			continue;
 			continue;
@@ -202,47 +267,34 @@ static void bsg_request_fn(struct request_queue *q)
 	spin_lock_irq(q->queue_lock);
 	spin_lock_irq(q->queue_lock);
 }
 }
 
 
+/* called right after the request is allocated for the request_queue */
 static int bsg_init_rq(struct request_queue *q, struct request *req, gfp_t gfp)
 static int bsg_init_rq(struct request_queue *q, struct request *req, gfp_t gfp)
 {
 {
 	struct bsg_job *job = blk_mq_rq_to_pdu(req);
 	struct bsg_job *job = blk_mq_rq_to_pdu(req);
-	struct scsi_request *sreq = &job->sreq;
-
-	/* called right after the request is allocated for the request_queue */
 
 
-	sreq->sense = kzalloc(SCSI_SENSE_BUFFERSIZE, gfp);
-	if (!sreq->sense)
+	job->reply = kzalloc(SCSI_SENSE_BUFFERSIZE, gfp);
+	if (!job->reply)
 		return -ENOMEM;
 		return -ENOMEM;
-
 	return 0;
 	return 0;
 }
 }
 
 
+/* called right before the request is given to the request_queue user */
 static void bsg_initialize_rq(struct request *req)
 static void bsg_initialize_rq(struct request *req)
 {
 {
 	struct bsg_job *job = blk_mq_rq_to_pdu(req);
 	struct bsg_job *job = blk_mq_rq_to_pdu(req);
-	struct scsi_request *sreq = &job->sreq;
-	void *sense = sreq->sense;
-
-	/* called right before the request is given to the request_queue user */
+	void *reply = job->reply;
 
 
 	memset(job, 0, sizeof(*job));
 	memset(job, 0, sizeof(*job));
-
-	scsi_req_init(sreq);
-
-	sreq->sense = sense;
-	sreq->sense_len = SCSI_SENSE_BUFFERSIZE;
-
-	job->req = req;
-	job->reply = sense;
-	job->reply_len = sreq->sense_len;
+	job->reply = reply;
+	job->reply_len = SCSI_SENSE_BUFFERSIZE;
 	job->dd_data = job + 1;
 	job->dd_data = job + 1;
 }
 }
 
 
 static void bsg_exit_rq(struct request_queue *q, struct request *req)
 static void bsg_exit_rq(struct request_queue *q, struct request *req)
 {
 {
 	struct bsg_job *job = blk_mq_rq_to_pdu(req);
 	struct bsg_job *job = blk_mq_rq_to_pdu(req);
-	struct scsi_request *sreq = &job->sreq;
 
 
-	kfree(sreq->sense);
+	kfree(job->reply);
 }
 }
 
 
 /**
 /**
@@ -275,12 +327,11 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
 
 
 	q->queuedata = dev;
 	q->queuedata = dev;
 	q->bsg_job_fn = job_fn;
 	q->bsg_job_fn = job_fn;
-	queue_flag_set_unlocked(QUEUE_FLAG_BIDI, q);
-	queue_flag_set_unlocked(QUEUE_FLAG_SCSI_PASSTHROUGH, q);
+	blk_queue_flag_set(QUEUE_FLAG_BIDI, q);
 	blk_queue_softirq_done(q, bsg_softirq_done);
 	blk_queue_softirq_done(q, bsg_softirq_done);
 	blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT);
 	blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT);
 
 
-	ret = bsg_register_queue(q, dev, name, release);
+	ret = bsg_register_queue(q, dev, name, &bsg_transport_ops, release);
 	if (ret) {
 	if (ret) {
 		printk(KERN_ERR "%s: bsg interface failed to "
 		printk(KERN_ERR "%s: bsg interface failed to "
 		       "initialize - register queue\n", dev->kobj.name);
 		       "initialize - register queue\n", dev->kobj.name);

+ 117 - 145
block/bsg.c

@@ -130,114 +130,120 @@ static inline struct hlist_head *bsg_dev_idx_hash(int index)
 	return &bsg_device_list[index & (BSG_LIST_ARRAY_SIZE - 1)];
 	return &bsg_device_list[index & (BSG_LIST_ARRAY_SIZE - 1)];
 }
 }
 
 
-static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
-				struct sg_io_v4 *hdr, struct bsg_device *bd,
-				fmode_t mode)
+#define uptr64(val) ((void __user *)(uintptr_t)(val))
+
+static int bsg_scsi_check_proto(struct sg_io_v4 *hdr)
+{
+	if (hdr->protocol != BSG_PROTOCOL_SCSI  ||
+	    hdr->subprotocol != BSG_SUB_PROTOCOL_SCSI_CMD)
+		return -EINVAL;
+	return 0;
+}
+
+static int bsg_scsi_fill_hdr(struct request *rq, struct sg_io_v4 *hdr,
+		fmode_t mode)
 {
 {
-	struct scsi_request *req = scsi_req(rq);
+	struct scsi_request *sreq = scsi_req(rq);
 
 
-	if (hdr->request_len > BLK_MAX_CDB) {
-		req->cmd = kzalloc(hdr->request_len, GFP_KERNEL);
-		if (!req->cmd)
+	sreq->cmd_len = hdr->request_len;
+	if (sreq->cmd_len > BLK_MAX_CDB) {
+		sreq->cmd = kzalloc(sreq->cmd_len, GFP_KERNEL);
+		if (!sreq->cmd)
 			return -ENOMEM;
 			return -ENOMEM;
 	}
 	}
 
 
-	if (copy_from_user(req->cmd, (void __user *)(unsigned long)hdr->request,
-			   hdr->request_len))
+	if (copy_from_user(sreq->cmd, uptr64(hdr->request), sreq->cmd_len))
 		return -EFAULT;
 		return -EFAULT;
-
-	if (hdr->subprotocol == BSG_SUB_PROTOCOL_SCSI_CMD) {
-		if (blk_verify_command(req->cmd, mode))
-			return -EPERM;
-	} else if (!capable(CAP_SYS_RAWIO))
+	if (blk_verify_command(sreq->cmd, mode))
 		return -EPERM;
 		return -EPERM;
-
-	/*
-	 * fill in request structure
-	 */
-	req->cmd_len = hdr->request_len;
-
-	rq->timeout = msecs_to_jiffies(hdr->timeout);
-	if (!rq->timeout)
-		rq->timeout = q->sg_timeout;
-	if (!rq->timeout)
-		rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
-	if (rq->timeout < BLK_MIN_SG_TIMEOUT)
-		rq->timeout = BLK_MIN_SG_TIMEOUT;
-
 	return 0;
 	return 0;
 }
 }
 
 
-/*
- * Check if sg_io_v4 from user is allowed and valid
- */
-static int
-bsg_validate_sgv4_hdr(struct sg_io_v4 *hdr, int *op)
+static int bsg_scsi_complete_rq(struct request *rq, struct sg_io_v4 *hdr)
 {
 {
+	struct scsi_request *sreq = scsi_req(rq);
 	int ret = 0;
 	int ret = 0;
 
 
-	if (hdr->guard != 'Q')
-		return -EINVAL;
+	/*
+	 * fill in all the output members
+	 */
+	hdr->device_status = sreq->result & 0xff;
+	hdr->transport_status = host_byte(sreq->result);
+	hdr->driver_status = driver_byte(sreq->result);
+	hdr->info = 0;
+	if (hdr->device_status || hdr->transport_status || hdr->driver_status)
+		hdr->info |= SG_INFO_CHECK;
+	hdr->response_len = 0;
 
 
-	switch (hdr->protocol) {
-	case BSG_PROTOCOL_SCSI:
-		switch (hdr->subprotocol) {
-		case BSG_SUB_PROTOCOL_SCSI_CMD:
-		case BSG_SUB_PROTOCOL_SCSI_TRANSPORT:
-			break;
-		default:
-			ret = -EINVAL;
-		}
-		break;
-	default:
-		ret = -EINVAL;
+	if (sreq->sense_len && hdr->response) {
+		int len = min_t(unsigned int, hdr->max_response_len,
+					sreq->sense_len);
+
+		if (copy_to_user(uptr64(hdr->response), sreq->sense, len))
+			ret = -EFAULT;
+		else
+			hdr->response_len = len;
+	}
+
+	if (rq->next_rq) {
+		hdr->dout_resid = sreq->resid_len;
+		hdr->din_resid = scsi_req(rq->next_rq)->resid_len;
+	} else if (rq_data_dir(rq) == READ) {
+		hdr->din_resid = sreq->resid_len;
+	} else {
+		hdr->dout_resid = sreq->resid_len;
 	}
 	}
 
 
-	*op = hdr->dout_xfer_len ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN;
 	return ret;
 	return ret;
 }
 }
 
 
-/*
- * map sg_io_v4 to a request.
- */
+static void bsg_scsi_free_rq(struct request *rq)
+{
+	scsi_req_free_cmd(scsi_req(rq));
+}
+
+static const struct bsg_ops bsg_scsi_ops = {
+	.check_proto		= bsg_scsi_check_proto,
+	.fill_hdr		= bsg_scsi_fill_hdr,
+	.complete_rq		= bsg_scsi_complete_rq,
+	.free_rq		= bsg_scsi_free_rq,
+};
+
 static struct request *
 static struct request *
-bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t mode)
+bsg_map_hdr(struct request_queue *q, struct sg_io_v4 *hdr, fmode_t mode)
 {
 {
-	struct request_queue *q = bd->queue;
 	struct request *rq, *next_rq = NULL;
 	struct request *rq, *next_rq = NULL;
 	int ret;
 	int ret;
-	unsigned int op, dxfer_len;
-	void __user *dxferp = NULL;
-	struct bsg_class_device *bcd = &q->bsg_dev;
 
 
-	/* if the LLD has been removed then the bsg_unregister_queue will
-	 * eventually be called and the class_dev was freed, so we can no
-	 * longer use this request_queue. Return no such address.
-	 */
-	if (!bcd->class_dev)
+	if (!q->bsg_dev.class_dev)
 		return ERR_PTR(-ENXIO);
 		return ERR_PTR(-ENXIO);
 
 
-	bsg_dbg(bd, "map hdr %llx/%u %llx/%u\n",
-		(unsigned long long) hdr->dout_xferp,
-		hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp,
-		hdr->din_xfer_len);
+	if (hdr->guard != 'Q')
+		return ERR_PTR(-EINVAL);
 
 
-	ret = bsg_validate_sgv4_hdr(hdr, &op);
+	ret = q->bsg_dev.ops->check_proto(hdr);
 	if (ret)
 	if (ret)
 		return ERR_PTR(ret);
 		return ERR_PTR(ret);
 
 
-	/*
-	 * map scatter-gather elements separately and string them to request
-	 */
-	rq = blk_get_request(q, op, GFP_KERNEL);
+	rq = blk_get_request(q, hdr->dout_xfer_len ?
+			REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN,
+			GFP_KERNEL);
 	if (IS_ERR(rq))
 	if (IS_ERR(rq))
 		return rq;
 		return rq;
 
 
-	ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, mode);
+	ret = q->bsg_dev.ops->fill_hdr(rq, hdr, mode);
 	if (ret)
 	if (ret)
 		goto out;
 		goto out;
 
 
-	if (op == REQ_OP_SCSI_OUT && hdr->din_xfer_len) {
+	rq->timeout = msecs_to_jiffies(hdr->timeout);
+	if (!rq->timeout)
+		rq->timeout = q->sg_timeout;
+	if (!rq->timeout)
+		rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
+	if (rq->timeout < BLK_MIN_SG_TIMEOUT)
+		rq->timeout = BLK_MIN_SG_TIMEOUT;
+
+	if (hdr->dout_xfer_len && hdr->din_xfer_len) {
 		if (!test_bit(QUEUE_FLAG_BIDI, &q->queue_flags)) {
 		if (!test_bit(QUEUE_FLAG_BIDI, &q->queue_flags)) {
 			ret = -EOPNOTSUPP;
 			ret = -EOPNOTSUPP;
 			goto out;
 			goto out;
@@ -246,42 +252,39 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t mode)
 		next_rq = blk_get_request(q, REQ_OP_SCSI_IN, GFP_KERNEL);
 		next_rq = blk_get_request(q, REQ_OP_SCSI_IN, GFP_KERNEL);
 		if (IS_ERR(next_rq)) {
 		if (IS_ERR(next_rq)) {
 			ret = PTR_ERR(next_rq);
 			ret = PTR_ERR(next_rq);
-			next_rq = NULL;
 			goto out;
 			goto out;
 		}
 		}
-		rq->next_rq = next_rq;
 
 
-		dxferp = (void __user *)(unsigned long)hdr->din_xferp;
-		ret =  blk_rq_map_user(q, next_rq, NULL, dxferp,
+		rq->next_rq = next_rq;
+		ret = blk_rq_map_user(q, next_rq, NULL, uptr64(hdr->din_xferp),
 				       hdr->din_xfer_len, GFP_KERNEL);
 				       hdr->din_xfer_len, GFP_KERNEL);
 		if (ret)
 		if (ret)
-			goto out;
+			goto out_free_nextrq;
 	}
 	}
 
 
 	if (hdr->dout_xfer_len) {
 	if (hdr->dout_xfer_len) {
-		dxfer_len = hdr->dout_xfer_len;
-		dxferp = (void __user *)(unsigned long)hdr->dout_xferp;
+		ret = blk_rq_map_user(q, rq, NULL, uptr64(hdr->dout_xferp),
+				hdr->dout_xfer_len, GFP_KERNEL);
 	} else if (hdr->din_xfer_len) {
 	} else if (hdr->din_xfer_len) {
-		dxfer_len = hdr->din_xfer_len;
-		dxferp = (void __user *)(unsigned long)hdr->din_xferp;
-	} else
-		dxfer_len = 0;
-
-	if (dxfer_len) {
-		ret = blk_rq_map_user(q, rq, NULL, dxferp, dxfer_len,
-				      GFP_KERNEL);
-		if (ret)
-			goto out;
+		ret = blk_rq_map_user(q, rq, NULL, uptr64(hdr->din_xferp),
+				hdr->din_xfer_len, GFP_KERNEL);
+	} else {
+		ret = blk_rq_map_user(q, rq, NULL, NULL, 0, GFP_KERNEL);
 	}
 	}
 
 
+	if (ret)
+		goto out_unmap_nextrq;
 	return rq;
 	return rq;
+
+out_unmap_nextrq:
+	if (rq->next_rq)
+		blk_rq_unmap_user(rq->next_rq->bio);
+out_free_nextrq:
+	if (rq->next_rq)
+		blk_put_request(rq->next_rq);
 out:
 out:
-	scsi_req_free_cmd(scsi_req(rq));
+	q->bsg_dev.ops->free_rq(rq);
 	blk_put_request(rq);
 	blk_put_request(rq);
-	if (next_rq) {
-		blk_rq_unmap_user(next_rq->bio);
-		blk_put_request(next_rq);
-	}
 	return ERR_PTR(ret);
 	return ERR_PTR(ret);
 }
 }
 
 
@@ -383,56 +386,18 @@ static struct bsg_command *bsg_get_done_cmd(struct bsg_device *bd)
 static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
 static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
 				    struct bio *bio, struct bio *bidi_bio)
 				    struct bio *bio, struct bio *bidi_bio)
 {
 {
-	struct scsi_request *req = scsi_req(rq);
-	int ret = 0;
-
-	pr_debug("rq %p bio %p 0x%x\n", rq, bio, req->result);
-	/*
-	 * fill in all the output members
-	 */
-	hdr->device_status = req->result & 0xff;
-	hdr->transport_status = host_byte(req->result);
-	hdr->driver_status = driver_byte(req->result);
-	hdr->info = 0;
-	if (hdr->device_status || hdr->transport_status || hdr->driver_status)
-		hdr->info |= SG_INFO_CHECK;
-	hdr->response_len = 0;
-
-	if (req->sense_len && hdr->response) {
-		int len = min_t(unsigned int, hdr->max_response_len,
-					req->sense_len);
+	int ret;
 
 
-		ret = copy_to_user((void __user *)(unsigned long)hdr->response,
-				   req->sense, len);
-		if (!ret)
-			hdr->response_len = len;
-		else
-			ret = -EFAULT;
-	}
+	ret = rq->q->bsg_dev.ops->complete_rq(rq, hdr);
 
 
 	if (rq->next_rq) {
 	if (rq->next_rq) {
-		hdr->dout_resid = req->resid_len;
-		hdr->din_resid = scsi_req(rq->next_rq)->resid_len;
 		blk_rq_unmap_user(bidi_bio);
 		blk_rq_unmap_user(bidi_bio);
 		blk_put_request(rq->next_rq);
 		blk_put_request(rq->next_rq);
-	} else if (rq_data_dir(rq) == READ)
-		hdr->din_resid = req->resid_len;
-	else
-		hdr->dout_resid = req->resid_len;
-
-	/*
-	 * If the request generated a negative error number, return it
-	 * (providing we aren't already returning an error); if it's
-	 * just a protocol response (i.e. non negative), that gets
-	 * processed above.
-	 */
-	if (!ret && req->result < 0)
-		ret = req->result;
+	}
 
 
 	blk_rq_unmap_user(bio);
 	blk_rq_unmap_user(bio);
-	scsi_req_free_cmd(req);
+	rq->q->bsg_dev.ops->free_rq(rq);
 	blk_put_request(rq);
 	blk_put_request(rq);
-
 	return ret;
 	return ret;
 }
 }
 
 
@@ -614,7 +579,7 @@ static int __bsg_write(struct bsg_device *bd, const char __user *buf,
 		/*
 		/*
 		 * get a request, fill in the blanks, and add to request queue
 		 * get a request, fill in the blanks, and add to request queue
 		 */
 		 */
-		rq = bsg_map_hdr(bd, &bc->hdr, mode);
+		rq = bsg_map_hdr(bd->queue, &bc->hdr, mode);
 		if (IS_ERR(rq)) {
 		if (IS_ERR(rq)) {
 			ret = PTR_ERR(rq);
 			ret = PTR_ERR(rq);
 			rq = NULL;
 			rq = NULL;
@@ -742,11 +707,6 @@ static struct bsg_device *bsg_add_device(struct inode *inode,
 	struct bsg_device *bd;
 	struct bsg_device *bd;
 	unsigned char buf[32];
 	unsigned char buf[32];
 
 
-	if (!blk_queue_scsi_passthrough(rq)) {
-		WARN_ONCE(true, "Attempt to register a non-SCSI queue\n");
-		return ERR_PTR(-EINVAL);
-	}
-
 	if (!blk_get_queue(rq))
 	if (!blk_get_queue(rq))
 		return ERR_PTR(-ENXIO);
 		return ERR_PTR(-ENXIO);
 
 
@@ -907,7 +867,7 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		if (copy_from_user(&hdr, uarg, sizeof(hdr)))
 		if (copy_from_user(&hdr, uarg, sizeof(hdr)))
 			return -EFAULT;
 			return -EFAULT;
 
 
-		rq = bsg_map_hdr(bd, &hdr, file->f_mode);
+		rq = bsg_map_hdr(bd->queue, &hdr, file->f_mode);
 		if (IS_ERR(rq))
 		if (IS_ERR(rq))
 			return PTR_ERR(rq);
 			return PTR_ERR(rq);
 
 
@@ -959,7 +919,8 @@ void bsg_unregister_queue(struct request_queue *q)
 EXPORT_SYMBOL_GPL(bsg_unregister_queue);
 EXPORT_SYMBOL_GPL(bsg_unregister_queue);
 
 
 int bsg_register_queue(struct request_queue *q, struct device *parent,
 int bsg_register_queue(struct request_queue *q, struct device *parent,
-		       const char *name, void (*release)(struct device *))
+		const char *name, const struct bsg_ops *ops,
+		void (*release)(struct device *))
 {
 {
 	struct bsg_class_device *bcd;
 	struct bsg_class_device *bcd;
 	dev_t dev;
 	dev_t dev;
@@ -996,6 +957,7 @@ int bsg_register_queue(struct request_queue *q, struct device *parent,
 	bcd->queue = q;
 	bcd->queue = q;
 	bcd->parent = get_device(parent);
 	bcd->parent = get_device(parent);
 	bcd->release = release;
 	bcd->release = release;
+	bcd->ops = ops;
 	kref_init(&bcd->ref);
 	kref_init(&bcd->ref);
 	dev = MKDEV(bsg_major, bcd->minor);
 	dev = MKDEV(bsg_major, bcd->minor);
 	class_dev = device_create(bsg_class, parent, dev, NULL, "%s", devname);
 	class_dev = device_create(bsg_class, parent, dev, NULL, "%s", devname);
@@ -1023,7 +985,17 @@ unlock:
 	mutex_unlock(&bsg_mutex);
 	mutex_unlock(&bsg_mutex);
 	return ret;
 	return ret;
 }
 }
-EXPORT_SYMBOL_GPL(bsg_register_queue);
+
+int bsg_scsi_register_queue(struct request_queue *q, struct device *parent)
+{
+	if (!blk_queue_scsi_passthrough(q)) {
+		WARN_ONCE(true, "Attempt to register a non-SCSI queue\n");
+		return -EINVAL;
+	}
+
+	return bsg_register_queue(q, parent, NULL, &bsg_scsi_ops, NULL);
+}
+EXPORT_SYMBOL_GPL(bsg_scsi_register_queue);
 
 
 static struct cdev bsg_cdev;
 static struct cdev bsg_cdev;
 
 

+ 27 - 10
block/sed-opal.c

@@ -554,15 +554,14 @@ static void add_token_u64(int *err, struct opal_dev *cmd, u64 number)
 
 
 	size_t len;
 	size_t len;
 	int msb;
 	int msb;
-	u8 n;
 
 
 	if (!(number & ~TINY_ATOM_DATA_MASK)) {
 	if (!(number & ~TINY_ATOM_DATA_MASK)) {
 		add_token_u8(err, cmd, number);
 		add_token_u8(err, cmd, number);
 		return;
 		return;
 	}
 	}
 
 
-	msb = fls(number);
-	len = DIV_ROUND_UP(msb, 4);
+	msb = fls64(number);
+	len = DIV_ROUND_UP(msb, 8);
 
 
 	if (cmd->pos >= IO_BUFFER_LENGTH - len - 1) {
 	if (cmd->pos >= IO_BUFFER_LENGTH - len - 1) {
 		pr_debug("Error adding u64: end of buffer.\n");
 		pr_debug("Error adding u64: end of buffer.\n");
@@ -570,10 +569,8 @@ static void add_token_u64(int *err, struct opal_dev *cmd, u64 number)
 		return;
 		return;
 	}
 	}
 	add_short_atom_header(cmd, false, false, len);
 	add_short_atom_header(cmd, false, false, len);
-	while (len--) {
-		n = number >> (len * 8);
-		add_token_u8(err, cmd, n);
-	}
+	while (len--)
+		add_token_u8(err, cmd, number >> (len * 8));
 }
 }
 
 
 static void add_token_bytestring(int *err, struct opal_dev *cmd,
 static void add_token_bytestring(int *err, struct opal_dev *cmd,
@@ -871,6 +868,9 @@ static int response_parse(const u8 *buf, size_t length,
 static size_t response_get_string(const struct parsed_resp *resp, int n,
 static size_t response_get_string(const struct parsed_resp *resp, int n,
 				  const char **store)
 				  const char **store)
 {
 {
+	u8 skip;
+	const struct opal_resp_tok *token;
+
 	*store = NULL;
 	*store = NULL;
 	if (!resp) {
 	if (!resp) {
 		pr_debug("Response is NULL\n");
 		pr_debug("Response is NULL\n");
@@ -883,13 +883,30 @@ static size_t response_get_string(const struct parsed_resp *resp, int n,
 		return 0;
 		return 0;
 	}
 	}
 
 
-	if (resp->toks[n].type != OPAL_DTA_TOKENID_BYTESTRING) {
+	token = &resp->toks[n];
+	if (token->type != OPAL_DTA_TOKENID_BYTESTRING) {
 		pr_debug("Token is not a byte string!\n");
 		pr_debug("Token is not a byte string!\n");
 		return 0;
 		return 0;
 	}
 	}
 
 
-	*store = resp->toks[n].pos + 1;
-	return resp->toks[n].len - 1;
+	switch (token->width) {
+	case OPAL_WIDTH_TINY:
+	case OPAL_WIDTH_SHORT:
+		skip = 1;
+		break;
+	case OPAL_WIDTH_MEDIUM:
+		skip = 2;
+		break;
+	case OPAL_WIDTH_LONG:
+		skip = 4;
+		break;
+	default:
+		pr_debug("Token has invalid width!\n");
+		return 0;
+	}
+
+	*store = token->pos + skip;
+	return token->len - skip;
 }
 }
 
 
 static u64 response_get_u64(const struct parsed_resp *resp, int n)
 static u64 response_get_u64(const struct parsed_resp *resp, int n)

+ 0 - 1
drivers/block/brd.c

@@ -24,7 +24,6 @@
 
 
 #include <linux/uaccess.h>
 #include <linux/uaccess.h>
 
 
-#define SECTOR_SHIFT		9
 #define PAGE_SECTORS_SHIFT	(PAGE_SHIFT - SECTOR_SHIFT)
 #define PAGE_SECTORS_SHIFT	(PAGE_SHIFT - SECTOR_SHIFT)
 #define PAGE_SECTORS		(1 << PAGE_SECTORS_SHIFT)
 #define PAGE_SECTORS		(1 << PAGE_SECTORS_SHIFT)
 
 

+ 1 - 2
drivers/block/drbd/drbd_main.c

@@ -2816,7 +2816,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
 
 
 	drbd_init_set_defaults(device);
 	drbd_init_set_defaults(device);
 
 
-	q = blk_alloc_queue(GFP_KERNEL);
+	q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, &resource->req_lock);
 	if (!q)
 	if (!q)
 		goto out_no_q;
 		goto out_no_q;
 	device->rq_queue = q;
 	device->rq_queue = q;
@@ -2848,7 +2848,6 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
 	/* Setting the max_hw_sectors to an odd value of 8kibyte here
 	/* Setting the max_hw_sectors to an odd value of 8kibyte here
 	   This triggers a max_bio_size message upon first attach or connect */
 	   This triggers a max_bio_size message upon first attach or connect */
 	blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
 	blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
-	q->queue_lock = &resource->req_lock;
 
 
 	device->md_io.page = alloc_page(GFP_KERNEL);
 	device->md_io.page = alloc_page(GFP_KERNEL);
 	if (!device->md_io.page)
 	if (!device->md_io.page)

+ 2 - 2
drivers/block/drbd/drbd_nl.c

@@ -1212,10 +1212,10 @@ static void decide_on_discard_support(struct drbd_device *device,
 		 * topology on all peers. */
 		 * topology on all peers. */
 		blk_queue_discard_granularity(q, 512);
 		blk_queue_discard_granularity(q, 512);
 		q->limits.max_discard_sectors = drbd_max_discard_sectors(connection);
 		q->limits.max_discard_sectors = drbd_max_discard_sectors(connection);
-		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
 		q->limits.max_write_zeroes_sectors = drbd_max_discard_sectors(connection);
 		q->limits.max_write_zeroes_sectors = drbd_max_discard_sectors(connection);
 	} else {
 	} else {
-		queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
+		blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
 		blk_queue_discard_granularity(q, 0);
 		blk_queue_discard_granularity(q, 0);
 		q->limits.max_discard_sectors = 0;
 		q->limits.max_discard_sectors = 0;
 		q->limits.max_write_zeroes_sectors = 0;
 		q->limits.max_write_zeroes_sectors = 0;

+ 48 - 29
drivers/block/loop.c

@@ -214,10 +214,10 @@ static void __loop_update_dio(struct loop_device *lo, bool dio)
 	blk_mq_freeze_queue(lo->lo_queue);
 	blk_mq_freeze_queue(lo->lo_queue);
 	lo->use_dio = use_dio;
 	lo->use_dio = use_dio;
 	if (use_dio) {
 	if (use_dio) {
-		queue_flag_clear_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue);
+		blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, lo->lo_queue);
 		lo->lo_flags |= LO_FLAGS_DIRECT_IO;
 		lo->lo_flags |= LO_FLAGS_DIRECT_IO;
 	} else {
 	} else {
-		queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue);
+		blk_queue_flag_set(QUEUE_FLAG_NOMERGES, lo->lo_queue);
 		lo->lo_flags &= ~LO_FLAGS_DIRECT_IO;
 		lo->lo_flags &= ~LO_FLAGS_DIRECT_IO;
 	}
 	}
 	blk_mq_unfreeze_queue(lo->lo_queue);
 	blk_mq_unfreeze_queue(lo->lo_queue);
@@ -817,7 +817,7 @@ static void loop_config_discard(struct loop_device *lo)
 		q->limits.discard_alignment = 0;
 		q->limits.discard_alignment = 0;
 		blk_queue_max_discard_sectors(q, 0);
 		blk_queue_max_discard_sectors(q, 0);
 		blk_queue_max_write_zeroes_sectors(q, 0);
 		blk_queue_max_write_zeroes_sectors(q, 0);
-		queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
+		blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
 		return;
 		return;
 	}
 	}
 
 
@@ -826,7 +826,7 @@ static void loop_config_discard(struct loop_device *lo)
 
 
 	blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
 	blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
 	blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
 	blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
-	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+	blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
 }
 }
 
 
 static void loop_unprepare_queue(struct loop_device *lo)
 static void loop_unprepare_queue(struct loop_device *lo)
@@ -1167,21 +1167,17 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 static int
 static int
 loop_get_status(struct loop_device *lo, struct loop_info64 *info)
 loop_get_status(struct loop_device *lo, struct loop_info64 *info)
 {
 {
-	struct file *file = lo->lo_backing_file;
+	struct file *file;
 	struct kstat stat;
 	struct kstat stat;
-	int error;
+	int ret;
 
 
-	if (lo->lo_state != Lo_bound)
+	if (lo->lo_state != Lo_bound) {
+		mutex_unlock(&lo->lo_ctl_mutex);
 		return -ENXIO;
 		return -ENXIO;
-	error = vfs_getattr(&file->f_path, &stat,
-			    STATX_INO, AT_STATX_SYNC_AS_STAT);
-	if (error)
-		return error;
+	}
+
 	memset(info, 0, sizeof(*info));
 	memset(info, 0, sizeof(*info));
 	info->lo_number = lo->lo_number;
 	info->lo_number = lo->lo_number;
-	info->lo_device = huge_encode_dev(stat.dev);
-	info->lo_inode = stat.ino;
-	info->lo_rdevice = huge_encode_dev(lo->lo_device ? stat.rdev : stat.dev);
 	info->lo_offset = lo->lo_offset;
 	info->lo_offset = lo->lo_offset;
 	info->lo_sizelimit = lo->lo_sizelimit;
 	info->lo_sizelimit = lo->lo_sizelimit;
 	info->lo_flags = lo->lo_flags;
 	info->lo_flags = lo->lo_flags;
@@ -1194,7 +1190,19 @@ loop_get_status(struct loop_device *lo, struct loop_info64 *info)
 		memcpy(info->lo_encrypt_key, lo->lo_encrypt_key,
 		memcpy(info->lo_encrypt_key, lo->lo_encrypt_key,
 		       lo->lo_encrypt_key_size);
 		       lo->lo_encrypt_key_size);
 	}
 	}
-	return 0;
+
+	/* Drop lo_ctl_mutex while we call into the filesystem. */
+	file = get_file(lo->lo_backing_file);
+	mutex_unlock(&lo->lo_ctl_mutex);
+	ret = vfs_getattr(&file->f_path, &stat, STATX_INO,
+			  AT_STATX_SYNC_AS_STAT);
+	if (!ret) {
+		info->lo_device = huge_encode_dev(stat.dev);
+		info->lo_inode = stat.ino;
+		info->lo_rdevice = huge_encode_dev(stat.rdev);
+	}
+	fput(file);
+	return ret;
 }
 }
 
 
 static void
 static void
@@ -1352,7 +1360,10 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode,
 	struct loop_device *lo = bdev->bd_disk->private_data;
 	struct loop_device *lo = bdev->bd_disk->private_data;
 	int err;
 	int err;
 
 
-	mutex_lock_nested(&lo->lo_ctl_mutex, 1);
+	err = mutex_lock_killable_nested(&lo->lo_ctl_mutex, 1);
+	if (err)
+		goto out_unlocked;
+
 	switch (cmd) {
 	switch (cmd) {
 	case LOOP_SET_FD:
 	case LOOP_SET_FD:
 		err = loop_set_fd(lo, mode, bdev, arg);
 		err = loop_set_fd(lo, mode, bdev, arg);
@@ -1374,7 +1385,8 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode,
 		break;
 		break;
 	case LOOP_GET_STATUS:
 	case LOOP_GET_STATUS:
 		err = loop_get_status_old(lo, (struct loop_info __user *) arg);
 		err = loop_get_status_old(lo, (struct loop_info __user *) arg);
-		break;
+		/* loop_get_status() unlocks lo_ctl_mutex */
+		goto out_unlocked;
 	case LOOP_SET_STATUS64:
 	case LOOP_SET_STATUS64:
 		err = -EPERM;
 		err = -EPERM;
 		if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
 		if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
@@ -1383,7 +1395,8 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode,
 		break;
 		break;
 	case LOOP_GET_STATUS64:
 	case LOOP_GET_STATUS64:
 		err = loop_get_status64(lo, (struct loop_info64 __user *) arg);
 		err = loop_get_status64(lo, (struct loop_info64 __user *) arg);
-		break;
+		/* loop_get_status() unlocks lo_ctl_mutex */
+		goto out_unlocked;
 	case LOOP_SET_CAPACITY:
 	case LOOP_SET_CAPACITY:
 		err = -EPERM;
 		err = -EPERM;
 		if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
 		if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
@@ -1535,16 +1548,20 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode,
 
 
 	switch(cmd) {
 	switch(cmd) {
 	case LOOP_SET_STATUS:
 	case LOOP_SET_STATUS:
-		mutex_lock(&lo->lo_ctl_mutex);
-		err = loop_set_status_compat(
-			lo, (const struct compat_loop_info __user *) arg);
-		mutex_unlock(&lo->lo_ctl_mutex);
+		err = mutex_lock_killable(&lo->lo_ctl_mutex);
+		if (!err) {
+			err = loop_set_status_compat(lo,
+						     (const struct compat_loop_info __user *)arg);
+			mutex_unlock(&lo->lo_ctl_mutex);
+		}
 		break;
 		break;
 	case LOOP_GET_STATUS:
 	case LOOP_GET_STATUS:
-		mutex_lock(&lo->lo_ctl_mutex);
-		err = loop_get_status_compat(
-			lo, (struct compat_loop_info __user *) arg);
-		mutex_unlock(&lo->lo_ctl_mutex);
+		err = mutex_lock_killable(&lo->lo_ctl_mutex);
+		if (!err) {
+			err = loop_get_status_compat(lo,
+						     (struct compat_loop_info __user *)arg);
+			/* loop_get_status() unlocks lo_ctl_mutex */
+		}
 		break;
 		break;
 	case LOOP_SET_CAPACITY:
 	case LOOP_SET_CAPACITY:
 	case LOOP_CLR_FD:
 	case LOOP_CLR_FD:
@@ -1808,7 +1825,7 @@ static int loop_add(struct loop_device **l, int i)
 	 * page. For directio mode, merge does help to dispatch bigger request
 	 * page. For directio mode, merge does help to dispatch bigger request
 	 * to underlayer disk. We will enable merge once directio is enabled.
 	 * to underlayer disk. We will enable merge once directio is enabled.
 	 */
 	 */
-	queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue);
+	blk_queue_flag_set(QUEUE_FLAG_NOMERGES, lo->lo_queue);
 
 
 	err = -ENOMEM;
 	err = -ENOMEM;
 	disk = lo->lo_disk = alloc_disk(1 << part_shift);
 	disk = lo->lo_disk = alloc_disk(1 << part_shift);
@@ -1864,8 +1881,8 @@ out:
 
 
 static void loop_remove(struct loop_device *lo)
 static void loop_remove(struct loop_device *lo)
 {
 {
-	blk_cleanup_queue(lo->lo_queue);
 	del_gendisk(lo->lo_disk);
 	del_gendisk(lo->lo_disk);
+	blk_cleanup_queue(lo->lo_queue);
 	blk_mq_free_tag_set(&lo->tag_set);
 	blk_mq_free_tag_set(&lo->tag_set);
 	put_disk(lo->lo_disk);
 	put_disk(lo->lo_disk);
 	kfree(lo);
 	kfree(lo);
@@ -1949,7 +1966,9 @@ static long loop_control_ioctl(struct file *file, unsigned int cmd,
 		ret = loop_lookup(&lo, parm);
 		ret = loop_lookup(&lo, parm);
 		if (ret < 0)
 		if (ret < 0)
 			break;
 			break;
-		mutex_lock(&lo->lo_ctl_mutex);
+		ret = mutex_lock_killable(&lo->lo_ctl_mutex);
+		if (ret)
+			break;
 		if (lo->lo_state != Lo_unbound) {
 		if (lo->lo_state != Lo_unbound) {
 			ret = -EBUSY;
 			ret = -EBUSY;
 			mutex_unlock(&lo->lo_ctl_mutex);
 			mutex_unlock(&lo->lo_ctl_mutex);

+ 4 - 4
drivers/block/mtip32xx/mtip32xx.c

@@ -159,7 +159,7 @@ static bool mtip_check_surprise_removal(struct pci_dev *pdev)
 	if (vendor_id == 0xFFFF) {
 	if (vendor_id == 0xFFFF) {
 		dd->sr = true;
 		dd->sr = true;
 		if (dd->queue)
 		if (dd->queue)
-			set_bit(QUEUE_FLAG_DEAD, &dd->queue->queue_flags);
+			blk_queue_flag_set(QUEUE_FLAG_DEAD, dd->queue);
 		else
 		else
 			dev_warn(&dd->pdev->dev,
 			dev_warn(&dd->pdev->dev,
 				"%s: dd->queue is NULL\n", __func__);
 				"%s: dd->queue is NULL\n", __func__);
@@ -3855,8 +3855,8 @@ skip_create_disk:
 		goto start_service_thread;
 		goto start_service_thread;
 
 
 	/* Set device limits. */
 	/* Set device limits. */
-	set_bit(QUEUE_FLAG_NONROT, &dd->queue->queue_flags);
-	clear_bit(QUEUE_FLAG_ADD_RANDOM, &dd->queue->queue_flags);
+	blk_queue_flag_set(QUEUE_FLAG_NONROT, dd->queue);
+	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, dd->queue);
 	blk_queue_max_segments(dd->queue, MTIP_MAX_SG);
 	blk_queue_max_segments(dd->queue, MTIP_MAX_SG);
 	blk_queue_physical_block_size(dd->queue, 4096);
 	blk_queue_physical_block_size(dd->queue, 4096);
 	blk_queue_max_hw_sectors(dd->queue, 0xffff);
 	blk_queue_max_hw_sectors(dd->queue, 0xffff);
@@ -3866,7 +3866,7 @@ skip_create_disk:
 
 
 	/* Signal trim support */
 	/* Signal trim support */
 	if (dd->trim_supp == true) {
 	if (dd->trim_supp == true) {
-		set_bit(QUEUE_FLAG_DISCARD, &dd->queue->queue_flags);
+		blk_queue_flag_set(QUEUE_FLAG_DISCARD, dd->queue);
 		dd->queue->limits.discard_granularity = 4096;
 		dd->queue->limits.discard_granularity = 4096;
 		blk_queue_max_discard_sectors(dd->queue,
 		blk_queue_max_discard_sectors(dd->queue,
 			MTIP_MAX_TRIM_ENTRY_LEN * MTIP_MAX_TRIM_ENTRIES);
 			MTIP_MAX_TRIM_ENTRY_LEN * MTIP_MAX_TRIM_ENTRIES);

+ 4 - 4
drivers/block/nbd.c

@@ -964,7 +964,7 @@ static void nbd_parse_flags(struct nbd_device *nbd)
 	else
 	else
 		set_disk_ro(nbd->disk, false);
 		set_disk_ro(nbd->disk, false);
 	if (config->flags & NBD_FLAG_SEND_TRIM)
 	if (config->flags & NBD_FLAG_SEND_TRIM)
-		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
+		blk_queue_flag_set(QUEUE_FLAG_DISCARD, nbd->disk->queue);
 	if (config->flags & NBD_FLAG_SEND_FLUSH) {
 	if (config->flags & NBD_FLAG_SEND_FLUSH) {
 		if (config->flags & NBD_FLAG_SEND_FUA)
 		if (config->flags & NBD_FLAG_SEND_FUA)
 			blk_queue_write_cache(nbd->disk->queue, true, true);
 			blk_queue_write_cache(nbd->disk->queue, true, true);
@@ -1040,7 +1040,7 @@ static void nbd_config_put(struct nbd_device *nbd)
 		nbd->config = NULL;
 		nbd->config = NULL;
 
 
 		nbd->tag_set.timeout = 0;
 		nbd->tag_set.timeout = 0;
-		queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
+		blk_queue_flag_clear(QUEUE_FLAG_DISCARD, nbd->disk->queue);
 
 
 		mutex_unlock(&nbd->config_lock);
 		mutex_unlock(&nbd->config_lock);
 		nbd_put(nbd);
 		nbd_put(nbd);
@@ -1488,8 +1488,8 @@ static int nbd_dev_add(int index)
 	/*
 	/*
 	 * Tell the block layer that we are not a rotational device
 	 * Tell the block layer that we are not a rotational device
 	 */
 	 */
-	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, disk->queue);
-	queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, disk->queue);
+	blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
+	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
 	disk->queue->limits.discard_granularity = 512;
 	disk->queue->limits.discard_granularity = 512;
 	blk_queue_max_discard_sectors(disk->queue, UINT_MAX);
 	blk_queue_max_discard_sectors(disk->queue, UINT_MAX);
 	blk_queue_max_segment_size(disk->queue, UINT_MAX);
 	blk_queue_max_segment_size(disk->queue, UINT_MAX);

+ 85 - 39
drivers/block/null_blk.c

@@ -16,10 +16,8 @@
 #include <linux/badblocks.h>
 #include <linux/badblocks.h>
 #include <linux/fault-inject.h>
 #include <linux/fault-inject.h>
 
 
-#define SECTOR_SHIFT		9
 #define PAGE_SECTORS_SHIFT	(PAGE_SHIFT - SECTOR_SHIFT)
 #define PAGE_SECTORS_SHIFT	(PAGE_SHIFT - SECTOR_SHIFT)
 #define PAGE_SECTORS		(1 << PAGE_SECTORS_SHIFT)
 #define PAGE_SECTORS		(1 << PAGE_SECTORS_SHIFT)
-#define SECTOR_SIZE		(1 << SECTOR_SHIFT)
 #define SECTOR_MASK		(PAGE_SECTORS - 1)
 #define SECTOR_MASK		(PAGE_SECTORS - 1)
 
 
 #define FREE_BATCH		16
 #define FREE_BATCH		16
@@ -29,6 +27,7 @@
 
 
 #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
 #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
 static DECLARE_FAULT_ATTR(null_timeout_attr);
 static DECLARE_FAULT_ATTR(null_timeout_attr);
+static DECLARE_FAULT_ATTR(null_requeue_attr);
 #endif
 #endif
 
 
 static inline u64 mb_per_tick(int mbps)
 static inline u64 mb_per_tick(int mbps)
@@ -53,6 +52,7 @@ struct nullb_queue {
 	wait_queue_head_t wait;
 	wait_queue_head_t wait;
 	unsigned int queue_depth;
 	unsigned int queue_depth;
 	struct nullb_device *dev;
 	struct nullb_device *dev;
+	unsigned int requeue_selection;
 
 
 	struct nullb_cmd *cmds;
 	struct nullb_cmd *cmds;
 };
 };
@@ -72,6 +72,7 @@ enum nullb_device_flags {
 	NULLB_DEV_FL_CACHE	= 3,
 	NULLB_DEV_FL_CACHE	= 3,
 };
 };
 
 
+#define MAP_SZ		((PAGE_SIZE >> SECTOR_SHIFT) + 2)
 /*
 /*
  * nullb_page is a page in memory for nullb devices.
  * nullb_page is a page in memory for nullb devices.
  *
  *
@@ -86,10 +87,10 @@ enum nullb_device_flags {
  */
  */
 struct nullb_page {
 struct nullb_page {
 	struct page *page;
 	struct page *page;
-	unsigned long bitmap;
+	DECLARE_BITMAP(bitmap, MAP_SZ);
 };
 };
-#define NULLB_PAGE_LOCK (sizeof(unsigned long) * 8 - 1)
-#define NULLB_PAGE_FREE (sizeof(unsigned long) * 8 - 2)
+#define NULLB_PAGE_LOCK (MAP_SZ - 1)
+#define NULLB_PAGE_FREE (MAP_SZ - 2)
 
 
 struct nullb_device {
 struct nullb_device {
 	struct nullb *nullb;
 	struct nullb *nullb;
@@ -170,6 +171,9 @@ MODULE_PARM_DESC(home_node, "Home node for the device");
 #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
 #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
 static char g_timeout_str[80];
 static char g_timeout_str[80];
 module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), S_IRUGO);
 module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), S_IRUGO);
+
+static char g_requeue_str[80];
+module_param_string(requeue, g_requeue_str, sizeof(g_requeue_str), S_IRUGO);
 #endif
 #endif
 
 
 static int g_queue_mode = NULL_Q_MQ;
 static int g_queue_mode = NULL_Q_MQ;
@@ -728,7 +732,7 @@ static struct nullb_page *null_alloc_page(gfp_t gfp_flags)
 	if (!t_page->page)
 	if (!t_page->page)
 		goto out_freepage;
 		goto out_freepage;
 
 
-	t_page->bitmap = 0;
+	memset(t_page->bitmap, 0, sizeof(t_page->bitmap));
 	return t_page;
 	return t_page;
 out_freepage:
 out_freepage:
 	kfree(t_page);
 	kfree(t_page);
@@ -738,13 +742,20 @@ out:
 
 
 static void null_free_page(struct nullb_page *t_page)
 static void null_free_page(struct nullb_page *t_page)
 {
 {
-	__set_bit(NULLB_PAGE_FREE, &t_page->bitmap);
-	if (test_bit(NULLB_PAGE_LOCK, &t_page->bitmap))
+	__set_bit(NULLB_PAGE_FREE, t_page->bitmap);
+	if (test_bit(NULLB_PAGE_LOCK, t_page->bitmap))
 		return;
 		return;
 	__free_page(t_page->page);
 	__free_page(t_page->page);
 	kfree(t_page);
 	kfree(t_page);
 }
 }
 
 
+static bool null_page_empty(struct nullb_page *page)
+{
+	int size = MAP_SZ - 2;
+
+	return find_first_bit(page->bitmap, size) == size;
+}
+
 static void null_free_sector(struct nullb *nullb, sector_t sector,
 static void null_free_sector(struct nullb *nullb, sector_t sector,
 	bool is_cache)
 	bool is_cache)
 {
 {
@@ -759,9 +770,9 @@ static void null_free_sector(struct nullb *nullb, sector_t sector,
 
 
 	t_page = radix_tree_lookup(root, idx);
 	t_page = radix_tree_lookup(root, idx);
 	if (t_page) {
 	if (t_page) {
-		__clear_bit(sector_bit, &t_page->bitmap);
+		__clear_bit(sector_bit, t_page->bitmap);
 
 
-		if (!t_page->bitmap) {
+		if (null_page_empty(t_page)) {
 			ret = radix_tree_delete_item(root, idx, t_page);
 			ret = radix_tree_delete_item(root, idx, t_page);
 			WARN_ON(ret != t_page);
 			WARN_ON(ret != t_page);
 			null_free_page(ret);
 			null_free_page(ret);
@@ -832,7 +843,7 @@ static struct nullb_page *__null_lookup_page(struct nullb *nullb,
 	t_page = radix_tree_lookup(root, idx);
 	t_page = radix_tree_lookup(root, idx);
 	WARN_ON(t_page && t_page->page->index != idx);
 	WARN_ON(t_page && t_page->page->index != idx);
 
 
-	if (t_page && (for_write || test_bit(sector_bit, &t_page->bitmap)))
+	if (t_page && (for_write || test_bit(sector_bit, t_page->bitmap)))
 		return t_page;
 		return t_page;
 
 
 	return NULL;
 	return NULL;
@@ -895,10 +906,10 @@ static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page)
 
 
 	t_page = null_insert_page(nullb, idx << PAGE_SECTORS_SHIFT, true);
 	t_page = null_insert_page(nullb, idx << PAGE_SECTORS_SHIFT, true);
 
 
-	__clear_bit(NULLB_PAGE_LOCK, &c_page->bitmap);
-	if (test_bit(NULLB_PAGE_FREE, &c_page->bitmap)) {
+	__clear_bit(NULLB_PAGE_LOCK, c_page->bitmap);
+	if (test_bit(NULLB_PAGE_FREE, c_page->bitmap)) {
 		null_free_page(c_page);
 		null_free_page(c_page);
-		if (t_page && t_page->bitmap == 0) {
+		if (t_page && null_page_empty(t_page)) {
 			ret = radix_tree_delete_item(&nullb->dev->data,
 			ret = radix_tree_delete_item(&nullb->dev->data,
 				idx, t_page);
 				idx, t_page);
 			null_free_page(t_page);
 			null_free_page(t_page);
@@ -914,11 +925,11 @@ static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page)
 
 
 	for (i = 0; i < PAGE_SECTORS;
 	for (i = 0; i < PAGE_SECTORS;
 			i += (nullb->dev->blocksize >> SECTOR_SHIFT)) {
 			i += (nullb->dev->blocksize >> SECTOR_SHIFT)) {
-		if (test_bit(i, &c_page->bitmap)) {
+		if (test_bit(i, c_page->bitmap)) {
 			offset = (i << SECTOR_SHIFT);
 			offset = (i << SECTOR_SHIFT);
 			memcpy(dst + offset, src + offset,
 			memcpy(dst + offset, src + offset,
 				nullb->dev->blocksize);
 				nullb->dev->blocksize);
-			__set_bit(i, &t_page->bitmap);
+			__set_bit(i, t_page->bitmap);
 		}
 		}
 	}
 	}
 
 
@@ -955,10 +966,10 @@ again:
 		 * We found the page which is being flushed to disk by other
 		 * We found the page which is being flushed to disk by other
 		 * threads
 		 * threads
 		 */
 		 */
-		if (test_bit(NULLB_PAGE_LOCK, &c_pages[i]->bitmap))
+		if (test_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap))
 			c_pages[i] = NULL;
 			c_pages[i] = NULL;
 		else
 		else
-			__set_bit(NULLB_PAGE_LOCK, &c_pages[i]->bitmap);
+			__set_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap);
 	}
 	}
 
 
 	one_round = 0;
 	one_round = 0;
@@ -1011,7 +1022,7 @@ static int copy_to_nullb(struct nullb *nullb, struct page *source,
 		kunmap_atomic(dst);
 		kunmap_atomic(dst);
 		kunmap_atomic(src);
 		kunmap_atomic(src);
 
 
-		__set_bit(sector & SECTOR_MASK, &t_page->bitmap);
+		__set_bit(sector & SECTOR_MASK, t_page->bitmap);
 
 
 		if (is_fua)
 		if (is_fua)
 			null_free_sector(nullb, sector, true);
 			null_free_sector(nullb, sector, true);
@@ -1380,7 +1391,15 @@ static bool should_timeout_request(struct request *rq)
 	if (g_timeout_str[0])
 	if (g_timeout_str[0])
 		return should_fail(&null_timeout_attr, 1);
 		return should_fail(&null_timeout_attr, 1);
 #endif
 #endif
+	return false;
+}
 
 
+static bool should_requeue_request(struct request *rq)
+{
+#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+	if (g_requeue_str[0])
+		return should_fail(&null_requeue_attr, 1);
+#endif
 	return false;
 	return false;
 }
 }
 
 
@@ -1391,11 +1410,17 @@ static void null_request_fn(struct request_queue *q)
 	while ((rq = blk_fetch_request(q)) != NULL) {
 	while ((rq = blk_fetch_request(q)) != NULL) {
 		struct nullb_cmd *cmd = rq->special;
 		struct nullb_cmd *cmd = rq->special;
 
 
-		if (!should_timeout_request(rq)) {
-			spin_unlock_irq(q->queue_lock);
-			null_handle_cmd(cmd);
-			spin_lock_irq(q->queue_lock);
+		/* just ignore the request */
+		if (should_timeout_request(rq))
+			continue;
+		if (should_requeue_request(rq)) {
+			blk_requeue_request(q, rq);
+			continue;
 		}
 		}
+
+		spin_unlock_irq(q->queue_lock);
+		null_handle_cmd(cmd);
+		spin_lock_irq(q->queue_lock);
 	}
 	}
 }
 }
 
 
@@ -1422,10 +1447,23 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
 
 
 	blk_mq_start_request(bd->rq);
 	blk_mq_start_request(bd->rq);
 
 
-	if (!should_timeout_request(bd->rq))
-		return null_handle_cmd(cmd);
+	if (should_requeue_request(bd->rq)) {
+		/*
+		 * Alternate between hitting the core BUSY path, and the
+		 * driver driven requeue path
+		 */
+		nq->requeue_selection++;
+		if (nq->requeue_selection & 1)
+			return BLK_STS_RESOURCE;
+		else {
+			blk_mq_requeue_request(bd->rq, true);
+			return BLK_STS_OK;
+		}
+	}
+	if (should_timeout_request(bd->rq))
+		return BLK_STS_OK;
 
 
-	return BLK_STS_OK;
+	return null_handle_cmd(cmd);
 }
 }
 
 
 static const struct blk_mq_ops null_mq_ops = {
 static const struct blk_mq_ops null_mq_ops = {
@@ -1485,7 +1523,7 @@ static void null_config_discard(struct nullb *nullb)
 	nullb->q->limits.discard_granularity = nullb->dev->blocksize;
 	nullb->q->limits.discard_granularity = nullb->dev->blocksize;
 	nullb->q->limits.discard_alignment = nullb->dev->blocksize;
 	nullb->q->limits.discard_alignment = nullb->dev->blocksize;
 	blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9);
 	blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9);
-	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, nullb->q);
+	blk_queue_flag_set(QUEUE_FLAG_DISCARD, nullb->q);
 }
 }
 
 
 static int null_open(struct block_device *bdev, fmode_t mode)
 static int null_open(struct block_device *bdev, fmode_t mode)
@@ -1659,16 +1697,27 @@ static void null_validate_conf(struct nullb_device *dev)
 		dev->mbps = 0;
 		dev->mbps = 0;
 }
 }
 
 
-static bool null_setup_fault(void)
-{
 #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
 #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
-	if (!g_timeout_str[0])
+static bool __null_setup_fault(struct fault_attr *attr, char *str)
+{
+	if (!str[0])
 		return true;
 		return true;
 
 
-	if (!setup_fault_attr(&null_timeout_attr, g_timeout_str))
+	if (!setup_fault_attr(attr, str))
 		return false;
 		return false;
 
 
-	null_timeout_attr.verbose = 0;
+	attr->verbose = 0;
+	return true;
+}
+#endif
+
+static bool null_setup_fault(void)
+{
+#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+	if (!__null_setup_fault(&null_timeout_attr, g_timeout_str))
+		return false;
+	if (!__null_setup_fault(&null_requeue_attr, g_requeue_str))
+		return false;
 #endif
 #endif
 	return true;
 	return true;
 }
 }
@@ -1717,7 +1766,8 @@ static int null_add_dev(struct nullb_device *dev)
 		}
 		}
 		null_init_queues(nullb);
 		null_init_queues(nullb);
 	} else if (dev->queue_mode == NULL_Q_BIO) {
 	} else if (dev->queue_mode == NULL_Q_BIO) {
-		nullb->q = blk_alloc_queue_node(GFP_KERNEL, dev->home_node);
+		nullb->q = blk_alloc_queue_node(GFP_KERNEL, dev->home_node,
+						NULL);
 		if (!nullb->q) {
 		if (!nullb->q) {
 			rv = -ENOMEM;
 			rv = -ENOMEM;
 			goto out_cleanup_queues;
 			goto out_cleanup_queues;
@@ -1758,8 +1808,8 @@ static int null_add_dev(struct nullb_device *dev)
 	}
 	}
 
 
 	nullb->q->queuedata = nullb;
 	nullb->q->queuedata = nullb;
-	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q);
-	queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, nullb->q);
+	blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q);
+	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q);
 
 
 	mutex_lock(&lock);
 	mutex_lock(&lock);
 	nullb->index = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL);
 	nullb->index = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL);
@@ -1802,10 +1852,6 @@ static int __init null_init(void)
 	struct nullb *nullb;
 	struct nullb *nullb;
 	struct nullb_device *dev;
 	struct nullb_device *dev;
 
 
-	/* check for nullb_page.bitmap */
-	if (sizeof(unsigned long) * 8 - 2 < (PAGE_SIZE >> SECTOR_SHIFT))
-		return -EINVAL;
-
 	if (g_bs > PAGE_SIZE) {
 	if (g_bs > PAGE_SIZE) {
 		pr_warn("null_blk: invalid block size\n");
 		pr_warn("null_blk: invalid block size\n");
 		pr_warn("null_blk: defaults block size to %lu\n", PAGE_SIZE);
 		pr_warn("null_blk: defaults block size to %lu\n", PAGE_SIZE);

+ 2 - 0
drivers/block/paride/pcd.c

@@ -230,6 +230,8 @@ static int pcd_block_open(struct block_device *bdev, fmode_t mode)
 	struct pcd_unit *cd = bdev->bd_disk->private_data;
 	struct pcd_unit *cd = bdev->bd_disk->private_data;
 	int ret;
 	int ret;
 
 
+	check_disk_change(bdev);
+
 	mutex_lock(&pcd_mutex);
 	mutex_lock(&pcd_mutex);
 	ret = cdrom_open(&cd->info, bdev, mode);
 	ret = cdrom_open(&cd->info, bdev, mode);
 	mutex_unlock(&pcd_mutex);
 	mutex_unlock(&pcd_mutex);

+ 2 - 11
drivers/block/rbd.c

@@ -50,15 +50,6 @@
 
 
 #define RBD_DEBUG	/* Activate rbd_assert() calls */
 #define RBD_DEBUG	/* Activate rbd_assert() calls */
 
 
-/*
- * The basic unit of block I/O is a sector.  It is interpreted in a
- * number of contexts in Linux (blk, bio, genhd), but the default is
- * universally 512 bytes.  These symbols are just slightly more
- * meaningful than the bare numbers they represent.
- */
-#define	SECTOR_SHIFT	9
-#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)
-
 /*
 /*
  * Increment the given counter and return its updated value.
  * Increment the given counter and return its updated value.
  * If the counter is already 0 it will not be incremented.
  * If the counter is already 0 it will not be incremented.
@@ -4370,7 +4361,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
 		goto out_tag_set;
 		goto out_tag_set;
 	}
 	}
 
 
-	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
+	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
 	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
 	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
 
 
 	/* set io sizes to object size */
 	/* set io sizes to object size */
@@ -4383,7 +4374,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
 	blk_queue_io_opt(q, segment_size);
 	blk_queue_io_opt(q, segment_size);
 
 
 	/* enable the discard support */
 	/* enable the discard support */
-	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+	blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
 	q->limits.discard_granularity = segment_size;
 	q->limits.discard_granularity = segment_size;
 	blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE);
 	blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE);
 	blk_queue_max_write_zeroes_sectors(q, segment_size / SECTOR_SIZE);
 	blk_queue_max_write_zeroes_sectors(q, segment_size / SECTOR_SIZE);

+ 3 - 3
drivers/block/rsxx/dev.c

@@ -287,10 +287,10 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card)
 	blk_queue_max_hw_sectors(card->queue, blkdev_max_hw_sectors);
 	blk_queue_max_hw_sectors(card->queue, blkdev_max_hw_sectors);
 	blk_queue_physical_block_size(card->queue, RSXX_HW_BLK_SIZE);
 	blk_queue_physical_block_size(card->queue, RSXX_HW_BLK_SIZE);
 
 
-	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, card->queue);
-	queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, card->queue);
+	blk_queue_flag_set(QUEUE_FLAG_NONROT, card->queue);
+	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, card->queue);
 	if (rsxx_discard_supported(card)) {
 	if (rsxx_discard_supported(card)) {
-		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, card->queue);
+		blk_queue_flag_set(QUEUE_FLAG_DISCARD, card->queue);
 		blk_queue_max_discard_sectors(card->queue,
 		blk_queue_max_discard_sectors(card->queue,
 						RSXX_HW_BLK_SIZE >> 9);
 						RSXX_HW_BLK_SIZE >> 9);
 		card->queue->limits.discard_granularity = RSXX_HW_BLK_SIZE;
 		card->queue->limits.discard_granularity = RSXX_HW_BLK_SIZE;

+ 2 - 2
drivers/block/skd_main.c

@@ -2858,8 +2858,8 @@ static int skd_cons_disk(struct skd_device *skdev)
 	/* set optimal I/O size to 8KB */
 	/* set optimal I/O size to 8KB */
 	blk_queue_io_opt(q, 8192);
 	blk_queue_io_opt(q, 8192);
 
 
-	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
-	queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q);
+	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
+	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q);
 
 
 	blk_queue_rq_timeout(q, 8 * HZ);
 	blk_queue_rq_timeout(q, 8 * HZ);
 
 

+ 3 - 4
drivers/block/umem.c

@@ -888,13 +888,14 @@ static int mm_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
 	card->Active = -1;	/* no page is active */
 	card->Active = -1;	/* no page is active */
 	card->bio = NULL;
 	card->bio = NULL;
 	card->biotail = &card->bio;
 	card->biotail = &card->bio;
+	spin_lock_init(&card->lock);
 
 
-	card->queue = blk_alloc_queue(GFP_KERNEL);
+	card->queue = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE,
+					   &card->lock);
 	if (!card->queue)
 	if (!card->queue)
 		goto failed_alloc;
 		goto failed_alloc;
 
 
 	blk_queue_make_request(card->queue, mm_make_request);
 	blk_queue_make_request(card->queue, mm_make_request);
-	card->queue->queue_lock = &card->lock;
 	card->queue->queuedata = card;
 	card->queue->queuedata = card;
 
 
 	tasklet_init(&card->tasklet, process_page, (unsigned long)card);
 	tasklet_init(&card->tasklet, process_page, (unsigned long)card);
@@ -968,8 +969,6 @@ static int mm_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
 	dev_printk(KERN_INFO, &card->dev->dev,
 	dev_printk(KERN_INFO, &card->dev->dev,
 		"Window size %d bytes, IRQ %d\n", data, dev->irq);
 		"Window size %d bytes, IRQ %d\n", data, dev->irq);
 
 
-	spin_lock_init(&card->lock);
-
 	pci_set_drvdata(dev, card);
 	pci_set_drvdata(dev, card);
 
 
 	if (pci_write_cmd != 0x0F) 	/* If not Memory Write & Invalidate */
 	if (pci_write_cmd != 0x0F) 	/* If not Memory Write & Invalidate */

+ 5 - 5
drivers/block/xen-blkfront.c

@@ -932,15 +932,15 @@ static void blkif_set_queue_limits(struct blkfront_info *info)
 	unsigned int segments = info->max_indirect_segments ? :
 	unsigned int segments = info->max_indirect_segments ? :
 				BLKIF_MAX_SEGMENTS_PER_REQUEST;
 				BLKIF_MAX_SEGMENTS_PER_REQUEST;
 
 
-	queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
+	blk_queue_flag_set(QUEUE_FLAG_VIRT, rq);
 
 
 	if (info->feature_discard) {
 	if (info->feature_discard) {
-		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, rq);
+		blk_queue_flag_set(QUEUE_FLAG_DISCARD, rq);
 		blk_queue_max_discard_sectors(rq, get_capacity(gd));
 		blk_queue_max_discard_sectors(rq, get_capacity(gd));
 		rq->limits.discard_granularity = info->discard_granularity;
 		rq->limits.discard_granularity = info->discard_granularity;
 		rq->limits.discard_alignment = info->discard_alignment;
 		rq->limits.discard_alignment = info->discard_alignment;
 		if (info->feature_secdiscard)
 		if (info->feature_secdiscard)
-			queue_flag_set_unlocked(QUEUE_FLAG_SECERASE, rq);
+			blk_queue_flag_set(QUEUE_FLAG_SECERASE, rq);
 	}
 	}
 
 
 	/* Hard sector size and max sectors impersonate the equiv. hardware. */
 	/* Hard sector size and max sectors impersonate the equiv. hardware. */
@@ -1611,8 +1611,8 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 				blkif_req(req)->error = BLK_STS_NOTSUPP;
 				blkif_req(req)->error = BLK_STS_NOTSUPP;
 				info->feature_discard = 0;
 				info->feature_discard = 0;
 				info->feature_secdiscard = 0;
 				info->feature_secdiscard = 0;
-				queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
-				queue_flag_clear(QUEUE_FLAG_SECERASE, rq);
+				blk_queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
+				blk_queue_flag_clear(QUEUE_FLAG_SECERASE, rq);
 			}
 			}
 			break;
 			break;
 		case BLKIF_OP_FLUSH_DISKCACHE:
 		case BLKIF_OP_FLUSH_DISKCACHE:

+ 4 - 4
drivers/block/zram/zram_drv.c

@@ -1530,8 +1530,8 @@ static int zram_add(void)
 	/* Actual capacity set using syfs (/sys/block/zram<id>/disksize */
 	/* Actual capacity set using syfs (/sys/block/zram<id>/disksize */
 	set_capacity(zram->disk, 0);
 	set_capacity(zram->disk, 0);
 	/* zram devices sort of resembles non-rotational disks */
 	/* zram devices sort of resembles non-rotational disks */
-	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue);
-	queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue);
+	blk_queue_flag_set(QUEUE_FLAG_NONROT, zram->disk->queue);
+	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue);
 
 
 	/*
 	/*
 	 * To ensure that we always get PAGE_SIZE aligned
 	 * To ensure that we always get PAGE_SIZE aligned
@@ -1544,7 +1544,7 @@ static int zram_add(void)
 	blk_queue_io_opt(zram->disk->queue, PAGE_SIZE);
 	blk_queue_io_opt(zram->disk->queue, PAGE_SIZE);
 	zram->disk->queue->limits.discard_granularity = PAGE_SIZE;
 	zram->disk->queue->limits.discard_granularity = PAGE_SIZE;
 	blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX);
 	blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX);
-	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zram->disk->queue);
+	blk_queue_flag_set(QUEUE_FLAG_DISCARD, zram->disk->queue);
 
 
 	/*
 	/*
 	 * zram_bio_discard() will clear all logical blocks if logical block
 	 * zram_bio_discard() will clear all logical blocks if logical block
@@ -1620,8 +1620,8 @@ static int zram_remove(struct zram *zram)
 
 
 	pr_info("Removed device: %s\n", zram->disk->disk_name);
 	pr_info("Removed device: %s\n", zram->disk->disk_name);
 
 
-	blk_cleanup_queue(zram->disk->queue);
 	del_gendisk(zram->disk);
 	del_gendisk(zram->disk);
+	blk_cleanup_queue(zram->disk->queue);
 	put_disk(zram->disk);
 	put_disk(zram->disk);
 	kfree(zram);
 	kfree(zram);
 	return 0;
 	return 0;

+ 0 - 1
drivers/block/zram/zram_drv.h

@@ -37,7 +37,6 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3;
 
 
 /*-- End of configurable params */
 /*-- End of configurable params */
 
 
-#define SECTOR_SHIFT		9
 #define SECTORS_PER_PAGE_SHIFT	(PAGE_SHIFT - SECTOR_SHIFT)
 #define SECTORS_PER_PAGE_SHIFT	(PAGE_SHIFT - SECTOR_SHIFT)
 #define SECTORS_PER_PAGE	(1 << SECTORS_PER_PAGE_SHIFT)
 #define SECTORS_PER_PAGE	(1 << SECTORS_PER_PAGE_SHIFT)
 #define ZRAM_LOGICAL_BLOCK_SHIFT 12
 #define ZRAM_LOGICAL_BLOCK_SHIFT 12

+ 0 - 3
drivers/cdrom/cdrom.c

@@ -1152,9 +1152,6 @@ int cdrom_open(struct cdrom_device_info *cdi, struct block_device *bdev,
 
 
 	cd_dbg(CD_OPEN, "entering cdrom_open\n");
 	cd_dbg(CD_OPEN, "entering cdrom_open\n");
 
 
-	/* open is event synchronization point, check events first */
-	check_disk_change(bdev);
-
 	/* if this was a O_NONBLOCK open and we should honor the flags,
 	/* if this was a O_NONBLOCK open and we should honor the flags,
 	 * do a quick open without drive/disc integrity checks. */
 	 * do a quick open without drive/disc integrity checks. */
 	cdi->use_count++;
 	cdi->use_count++;

+ 3 - 0
drivers/cdrom/gdrom.c

@@ -497,6 +497,9 @@ static const struct cdrom_device_ops gdrom_ops = {
 static int gdrom_bdops_open(struct block_device *bdev, fmode_t mode)
 static int gdrom_bdops_open(struct block_device *bdev, fmode_t mode)
 {
 {
 	int ret;
 	int ret;
+
+	check_disk_change(bdev);
+
 	mutex_lock(&gdrom_mutex);
 	mutex_lock(&gdrom_mutex);
 	ret = cdrom_open(gd.cd_info, bdev, mode);
 	ret = cdrom_open(gd.cd_info, bdev, mode);
 	mutex_unlock(&gdrom_mutex);
 	mutex_unlock(&gdrom_mutex);

+ 6 - 4
drivers/ide/ide-cd.c

@@ -712,7 +712,7 @@ static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq)
 	struct request_queue *q = drive->queue;
 	struct request_queue *q = drive->queue;
 	int write = rq_data_dir(rq) == WRITE;
 	int write = rq_data_dir(rq) == WRITE;
 	unsigned short sectors_per_frame =
 	unsigned short sectors_per_frame =
-		queue_logical_block_size(q) >> SECTOR_BITS;
+		queue_logical_block_size(q) >> SECTOR_SHIFT;
 
 
 	ide_debug_log(IDE_DBG_RQ, "rq->cmd[0]: 0x%x, rq->cmd_flags: 0x%x, "
 	ide_debug_log(IDE_DBG_RQ, "rq->cmd[0]: 0x%x, rq->cmd_flags: 0x%x, "
 				  "secs_per_frame: %u",
 				  "secs_per_frame: %u",
@@ -919,7 +919,7 @@ static int cdrom_read_capacity(ide_drive_t *drive, unsigned long *capacity,
 	 * end up being bogus.
 	 * end up being bogus.
 	 */
 	 */
 	blocklen = be32_to_cpu(capbuf.blocklen);
 	blocklen = be32_to_cpu(capbuf.blocklen);
-	blocklen = (blocklen >> SECTOR_BITS) << SECTOR_BITS;
+	blocklen = (blocklen >> SECTOR_SHIFT) << SECTOR_SHIFT;
 	switch (blocklen) {
 	switch (blocklen) {
 	case 512:
 	case 512:
 	case 1024:
 	case 1024:
@@ -935,7 +935,7 @@ static int cdrom_read_capacity(ide_drive_t *drive, unsigned long *capacity,
 	}
 	}
 
 
 	*capacity = 1 + be32_to_cpu(capbuf.lba);
 	*capacity = 1 + be32_to_cpu(capbuf.lba);
-	*sectors_per_frame = blocklen >> SECTOR_BITS;
+	*sectors_per_frame = blocklen >> SECTOR_SHIFT;
 
 
 	ide_debug_log(IDE_DBG_PROBE, "cap: %lu, sectors_per_frame: %lu",
 	ide_debug_log(IDE_DBG_PROBE, "cap: %lu, sectors_per_frame: %lu",
 				     *capacity, *sectors_per_frame);
 				     *capacity, *sectors_per_frame);
@@ -1012,7 +1012,7 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense)
 	drive->probed_capacity = toc->capacity * sectors_per_frame;
 	drive->probed_capacity = toc->capacity * sectors_per_frame;
 
 
 	blk_queue_logical_block_size(drive->queue,
 	blk_queue_logical_block_size(drive->queue,
-				     sectors_per_frame << SECTOR_BITS);
+				     sectors_per_frame << SECTOR_SHIFT);
 
 
 	/* first read just the header, so we know how long the TOC is */
 	/* first read just the header, so we know how long the TOC is */
 	stat = cdrom_read_tocentry(drive, 0, 1, 0, (char *) &toc->hdr,
 	stat = cdrom_read_tocentry(drive, 0, 1, 0, (char *) &toc->hdr,
@@ -1613,6 +1613,8 @@ static int idecd_open(struct block_device *bdev, fmode_t mode)
 	struct cdrom_info *info;
 	struct cdrom_info *info;
 	int rc = -ENXIO;
 	int rc = -ENXIO;
 
 
+	check_disk_change(bdev);
+
 	mutex_lock(&ide_cd_mutex);
 	mutex_lock(&ide_cd_mutex);
 	info = ide_cd_get(bdev->bd_disk);
 	info = ide_cd_get(bdev->bd_disk);
 	if (!info)
 	if (!info)

+ 1 - 5
drivers/ide/ide-cd.h

@@ -21,11 +21,7 @@
 
 
 /************************************************************************/
 /************************************************************************/
 
 
-#define SECTOR_BITS 		9
-#ifndef SECTOR_SIZE
-#define SECTOR_SIZE		(1 << SECTOR_BITS)
-#endif
-#define SECTORS_PER_FRAME	(CD_FRAMESIZE >> SECTOR_BITS)
+#define SECTORS_PER_FRAME	(CD_FRAMESIZE >> SECTOR_SHIFT)
 #define SECTOR_BUFFER_SIZE	(CD_FRAMESIZE * 32)
 #define SECTOR_BUFFER_SIZE	(CD_FRAMESIZE * 32)
 
 
 /* Capabilities Page size including 8 bytes of Mode Page Header */
 /* Capabilities Page size including 8 bytes of Mode Page Header */

+ 2 - 2
drivers/ide/ide-disk.c

@@ -687,8 +687,8 @@ static void ide_disk_setup(ide_drive_t *drive)
 	       queue_max_sectors(q) / 2);
 	       queue_max_sectors(q) / 2);
 
 
 	if (ata_id_is_ssd(id)) {
 	if (ata_id_is_ssd(id)) {
-		queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
-		queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q);
+		blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
+		blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q);
 	}
 	}
 
 
 	/* calculate drive capacity, and select LBA if possible */
 	/* calculate drive capacity, and select LBA if possible */

+ 2 - 2
drivers/ide/ide-probe.c

@@ -766,14 +766,14 @@ static int ide_init_queue(ide_drive_t *drive)
 	 *	limits and LBA48 we could raise it but as yet
 	 *	limits and LBA48 we could raise it but as yet
 	 *	do not.
 	 *	do not.
 	 */
 	 */
-	q = blk_alloc_queue_node(GFP_KERNEL, hwif_to_node(hwif));
+	q = blk_alloc_queue_node(GFP_KERNEL, hwif_to_node(hwif), NULL);
 	if (!q)
 	if (!q)
 		return 1;
 		return 1;
 
 
 	q->request_fn = do_ide_request;
 	q->request_fn = do_ide_request;
 	q->initialize_rq_fn = ide_initialize_rq;
 	q->initialize_rq_fn = ide_initialize_rq;
 	q->cmd_size = sizeof(struct ide_request);
 	q->cmd_size = sizeof(struct ide_request);
-	queue_flag_set_unlocked(QUEUE_FLAG_SCSI_PASSTHROUGH, q);
+	blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, q);
 	if (blk_init_allocated_queue(q) < 0) {
 	if (blk_init_allocated_queue(q) < 0) {
 		blk_cleanup_queue(q);
 		blk_cleanup_queue(q);
 		return 1;
 		return 1;

+ 91 - 149
drivers/lightnvm/core.c

@@ -36,13 +36,13 @@ static DECLARE_RWSEM(nvm_lock);
 /* Map between virtual and physical channel and lun */
 /* Map between virtual and physical channel and lun */
 struct nvm_ch_map {
 struct nvm_ch_map {
 	int ch_off;
 	int ch_off;
-	int nr_luns;
+	int num_lun;
 	int *lun_offs;
 	int *lun_offs;
 };
 };
 
 
 struct nvm_dev_map {
 struct nvm_dev_map {
 	struct nvm_ch_map *chnls;
 	struct nvm_ch_map *chnls;
-	int nr_chnls;
+	int num_ch;
 };
 };
 
 
 static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name)
 static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name)
@@ -114,15 +114,15 @@ static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev, int clear)
 	struct nvm_dev_map *dev_map = tgt_dev->map;
 	struct nvm_dev_map *dev_map = tgt_dev->map;
 	int i, j;
 	int i, j;
 
 
-	for (i = 0; i < dev_map->nr_chnls; i++) {
+	for (i = 0; i < dev_map->num_ch; i++) {
 		struct nvm_ch_map *ch_map = &dev_map->chnls[i];
 		struct nvm_ch_map *ch_map = &dev_map->chnls[i];
 		int *lun_offs = ch_map->lun_offs;
 		int *lun_offs = ch_map->lun_offs;
 		int ch = i + ch_map->ch_off;
 		int ch = i + ch_map->ch_off;
 
 
 		if (clear) {
 		if (clear) {
-			for (j = 0; j < ch_map->nr_luns; j++) {
+			for (j = 0; j < ch_map->num_lun; j++) {
 				int lun = j + lun_offs[j];
 				int lun = j + lun_offs[j];
-				int lunid = (ch * dev->geo.nr_luns) + lun;
+				int lunid = (ch * dev->geo.num_lun) + lun;
 
 
 				WARN_ON(!test_and_clear_bit(lunid,
 				WARN_ON(!test_and_clear_bit(lunid,
 							dev->lun_map));
 							dev->lun_map));
@@ -147,47 +147,46 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev,
 	struct nvm_dev_map *dev_rmap = dev->rmap;
 	struct nvm_dev_map *dev_rmap = dev->rmap;
 	struct nvm_dev_map *dev_map;
 	struct nvm_dev_map *dev_map;
 	struct ppa_addr *luns;
 	struct ppa_addr *luns;
-	int nr_luns = lun_end - lun_begin + 1;
-	int luns_left = nr_luns;
-	int nr_chnls = nr_luns / dev->geo.nr_luns;
-	int nr_chnls_mod = nr_luns % dev->geo.nr_luns;
-	int bch = lun_begin / dev->geo.nr_luns;
-	int blun = lun_begin % dev->geo.nr_luns;
+	int num_lun = lun_end - lun_begin + 1;
+	int luns_left = num_lun;
+	int num_ch = num_lun / dev->geo.num_lun;
+	int num_ch_mod = num_lun % dev->geo.num_lun;
+	int bch = lun_begin / dev->geo.num_lun;
+	int blun = lun_begin % dev->geo.num_lun;
 	int lunid = 0;
 	int lunid = 0;
 	int lun_balanced = 1;
 	int lun_balanced = 1;
-	int prev_nr_luns;
+	int sec_per_lun, prev_num_lun;
 	int i, j;
 	int i, j;
 
 
-	nr_chnls = (nr_chnls_mod == 0) ? nr_chnls : nr_chnls + 1;
+	num_ch = (num_ch_mod == 0) ? num_ch : num_ch + 1;
 
 
 	dev_map = kmalloc(sizeof(struct nvm_dev_map), GFP_KERNEL);
 	dev_map = kmalloc(sizeof(struct nvm_dev_map), GFP_KERNEL);
 	if (!dev_map)
 	if (!dev_map)
 		goto err_dev;
 		goto err_dev;
 
 
-	dev_map->chnls = kcalloc(nr_chnls, sizeof(struct nvm_ch_map),
-								GFP_KERNEL);
+	dev_map->chnls = kcalloc(num_ch, sizeof(struct nvm_ch_map), GFP_KERNEL);
 	if (!dev_map->chnls)
 	if (!dev_map->chnls)
 		goto err_chnls;
 		goto err_chnls;
 
 
-	luns = kcalloc(nr_luns, sizeof(struct ppa_addr), GFP_KERNEL);
+	luns = kcalloc(num_lun, sizeof(struct ppa_addr), GFP_KERNEL);
 	if (!luns)
 	if (!luns)
 		goto err_luns;
 		goto err_luns;
 
 
-	prev_nr_luns = (luns_left > dev->geo.nr_luns) ?
-					dev->geo.nr_luns : luns_left;
-	for (i = 0; i < nr_chnls; i++) {
+	prev_num_lun = (luns_left > dev->geo.num_lun) ?
+					dev->geo.num_lun : luns_left;
+	for (i = 0; i < num_ch; i++) {
 		struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[i + bch];
 		struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[i + bch];
 		int *lun_roffs = ch_rmap->lun_offs;
 		int *lun_roffs = ch_rmap->lun_offs;
 		struct nvm_ch_map *ch_map = &dev_map->chnls[i];
 		struct nvm_ch_map *ch_map = &dev_map->chnls[i];
 		int *lun_offs;
 		int *lun_offs;
-		int luns_in_chnl = (luns_left > dev->geo.nr_luns) ?
-					dev->geo.nr_luns : luns_left;
+		int luns_in_chnl = (luns_left > dev->geo.num_lun) ?
+					dev->geo.num_lun : luns_left;
 
 
-		if (lun_balanced && prev_nr_luns != luns_in_chnl)
+		if (lun_balanced && prev_num_lun != luns_in_chnl)
 			lun_balanced = 0;
 			lun_balanced = 0;
 
 
 		ch_map->ch_off = ch_rmap->ch_off = bch;
 		ch_map->ch_off = ch_rmap->ch_off = bch;
-		ch_map->nr_luns = luns_in_chnl;
+		ch_map->num_lun = luns_in_chnl;
 
 
 		lun_offs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL);
 		lun_offs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL);
 		if (!lun_offs)
 		if (!lun_offs)
@@ -195,8 +194,8 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev,
 
 
 		for (j = 0; j < luns_in_chnl; j++) {
 		for (j = 0; j < luns_in_chnl; j++) {
 			luns[lunid].ppa = 0;
 			luns[lunid].ppa = 0;
-			luns[lunid].g.ch = i;
-			luns[lunid++].g.lun = j;
+			luns[lunid].a.ch = i;
+			luns[lunid++].a.lun = j;
 
 
 			lun_offs[j] = blun;
 			lun_offs[j] = blun;
 			lun_roffs[j + blun] = blun;
 			lun_roffs[j + blun] = blun;
@@ -209,24 +208,29 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev,
 		luns_left -= luns_in_chnl;
 		luns_left -= luns_in_chnl;
 	}
 	}
 
 
-	dev_map->nr_chnls = nr_chnls;
+	dev_map->num_ch = num_ch;
 
 
 	tgt_dev = kmalloc(sizeof(struct nvm_tgt_dev), GFP_KERNEL);
 	tgt_dev = kmalloc(sizeof(struct nvm_tgt_dev), GFP_KERNEL);
 	if (!tgt_dev)
 	if (!tgt_dev)
 		goto err_ch;
 		goto err_ch;
 
 
+	/* Inherit device geometry from parent */
 	memcpy(&tgt_dev->geo, &dev->geo, sizeof(struct nvm_geo));
 	memcpy(&tgt_dev->geo, &dev->geo, sizeof(struct nvm_geo));
+
 	/* Target device only owns a portion of the physical device */
 	/* Target device only owns a portion of the physical device */
-	tgt_dev->geo.nr_chnls = nr_chnls;
-	tgt_dev->geo.all_luns = nr_luns;
-	tgt_dev->geo.nr_luns = (lun_balanced) ? prev_nr_luns : -1;
+	tgt_dev->geo.num_ch = num_ch;
+	tgt_dev->geo.num_lun = (lun_balanced) ? prev_num_lun : -1;
+	tgt_dev->geo.all_luns = num_lun;
+	tgt_dev->geo.all_chunks = num_lun * dev->geo.num_chk;
+
 	tgt_dev->geo.op = op;
 	tgt_dev->geo.op = op;
-	tgt_dev->total_secs = nr_luns * tgt_dev->geo.sec_per_lun;
+
+	sec_per_lun = dev->geo.clba * dev->geo.num_chk;
+	tgt_dev->geo.total_secs = num_lun * sec_per_lun;
+
 	tgt_dev->q = dev->q;
 	tgt_dev->q = dev->q;
 	tgt_dev->map = dev_map;
 	tgt_dev->map = dev_map;
 	tgt_dev->luns = luns;
 	tgt_dev->luns = luns;
-	memcpy(&tgt_dev->identity, &dev->identity, sizeof(struct nvm_id));
-
 	tgt_dev->parent = dev;
 	tgt_dev->parent = dev;
 
 
 	return tgt_dev;
 	return tgt_dev;
@@ -296,24 +300,20 @@ static int __nvm_config_simple(struct nvm_dev *dev,
 static int __nvm_config_extended(struct nvm_dev *dev,
 static int __nvm_config_extended(struct nvm_dev *dev,
 				 struct nvm_ioctl_create_extended *e)
 				 struct nvm_ioctl_create_extended *e)
 {
 {
-	struct nvm_geo *geo = &dev->geo;
-
 	if (e->lun_begin == 0xFFFF && e->lun_end == 0xFFFF) {
 	if (e->lun_begin == 0xFFFF && e->lun_end == 0xFFFF) {
 		e->lun_begin = 0;
 		e->lun_begin = 0;
 		e->lun_end = dev->geo.all_luns - 1;
 		e->lun_end = dev->geo.all_luns - 1;
 	}
 	}
 
 
 	/* op not set falls into target's default */
 	/* op not set falls into target's default */
-	if (e->op == 0xFFFF)
+	if (e->op == 0xFFFF) {
 		e->op = NVM_TARGET_DEFAULT_OP;
 		e->op = NVM_TARGET_DEFAULT_OP;
-
-	if (e->op < NVM_TARGET_MIN_OP ||
-	    e->op > NVM_TARGET_MAX_OP) {
+	} else if (e->op < NVM_TARGET_MIN_OP || e->op > NVM_TARGET_MAX_OP) {
 		pr_err("nvm: invalid over provisioning value\n");
 		pr_err("nvm: invalid over provisioning value\n");
 		return -EINVAL;
 		return -EINVAL;
 	}
 	}
 
 
-	return nvm_config_check_luns(geo, e->lun_begin, e->lun_end);
+	return nvm_config_check_luns(&dev->geo, e->lun_begin, e->lun_end);
 }
 }
 
 
 static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
 static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
@@ -384,7 +384,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
 		goto err_dev;
 		goto err_dev;
 	}
 	}
 
 
-	tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node);
+	tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node, NULL);
 	if (!tqueue) {
 	if (!tqueue) {
 		ret = -ENOMEM;
 		ret = -ENOMEM;
 		goto err_disk;
 		goto err_disk;
@@ -407,7 +407,8 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
 	tdisk->private_data = targetdata;
 	tdisk->private_data = targetdata;
 	tqueue->queuedata = targetdata;
 	tqueue->queuedata = targetdata;
 
 
-	blk_queue_max_hw_sectors(tqueue, 8 * dev->ops->max_phys_sect);
+	blk_queue_max_hw_sectors(tqueue,
+			(dev->geo.csecs >> 9) * NVM_MAX_VLBA);
 
 
 	set_capacity(tdisk, tt->capacity(targetdata));
 	set_capacity(tdisk, tt->capacity(targetdata));
 	add_disk(tdisk);
 	add_disk(tdisk);
@@ -503,20 +504,20 @@ static int nvm_register_map(struct nvm_dev *dev)
 	if (!rmap)
 	if (!rmap)
 		goto err_rmap;
 		goto err_rmap;
 
 
-	rmap->chnls = kcalloc(dev->geo.nr_chnls, sizeof(struct nvm_ch_map),
+	rmap->chnls = kcalloc(dev->geo.num_ch, sizeof(struct nvm_ch_map),
 								GFP_KERNEL);
 								GFP_KERNEL);
 	if (!rmap->chnls)
 	if (!rmap->chnls)
 		goto err_chnls;
 		goto err_chnls;
 
 
-	for (i = 0; i < dev->geo.nr_chnls; i++) {
+	for (i = 0; i < dev->geo.num_ch; i++) {
 		struct nvm_ch_map *ch_rmap;
 		struct nvm_ch_map *ch_rmap;
 		int *lun_roffs;
 		int *lun_roffs;
-		int luns_in_chnl = dev->geo.nr_luns;
+		int luns_in_chnl = dev->geo.num_lun;
 
 
 		ch_rmap = &rmap->chnls[i];
 		ch_rmap = &rmap->chnls[i];
 
 
 		ch_rmap->ch_off = -1;
 		ch_rmap->ch_off = -1;
-		ch_rmap->nr_luns = luns_in_chnl;
+		ch_rmap->num_lun = luns_in_chnl;
 
 
 		lun_roffs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL);
 		lun_roffs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL);
 		if (!lun_roffs)
 		if (!lun_roffs)
@@ -545,7 +546,7 @@ static void nvm_unregister_map(struct nvm_dev *dev)
 	struct nvm_dev_map *rmap = dev->rmap;
 	struct nvm_dev_map *rmap = dev->rmap;
 	int i;
 	int i;
 
 
-	for (i = 0; i < dev->geo.nr_chnls; i++)
+	for (i = 0; i < dev->geo.num_ch; i++)
 		kfree(rmap->chnls[i].lun_offs);
 		kfree(rmap->chnls[i].lun_offs);
 
 
 	kfree(rmap->chnls);
 	kfree(rmap->chnls);
@@ -555,22 +556,22 @@ static void nvm_unregister_map(struct nvm_dev *dev)
 static void nvm_map_to_dev(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p)
 static void nvm_map_to_dev(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p)
 {
 {
 	struct nvm_dev_map *dev_map = tgt_dev->map;
 	struct nvm_dev_map *dev_map = tgt_dev->map;
-	struct nvm_ch_map *ch_map = &dev_map->chnls[p->g.ch];
-	int lun_off = ch_map->lun_offs[p->g.lun];
+	struct nvm_ch_map *ch_map = &dev_map->chnls[p->a.ch];
+	int lun_off = ch_map->lun_offs[p->a.lun];
 
 
-	p->g.ch += ch_map->ch_off;
-	p->g.lun += lun_off;
+	p->a.ch += ch_map->ch_off;
+	p->a.lun += lun_off;
 }
 }
 
 
 static void nvm_map_to_tgt(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p)
 static void nvm_map_to_tgt(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p)
 {
 {
 	struct nvm_dev *dev = tgt_dev->parent;
 	struct nvm_dev *dev = tgt_dev->parent;
 	struct nvm_dev_map *dev_rmap = dev->rmap;
 	struct nvm_dev_map *dev_rmap = dev->rmap;
-	struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[p->g.ch];
-	int lun_roff = ch_rmap->lun_offs[p->g.lun];
+	struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[p->a.ch];
+	int lun_roff = ch_rmap->lun_offs[p->a.lun];
 
 
-	p->g.ch -= ch_rmap->ch_off;
-	p->g.lun -= lun_roff;
+	p->a.ch -= ch_rmap->ch_off;
+	p->a.lun -= lun_roff;
 }
 }
 
 
 static void nvm_ppa_tgt_to_dev(struct nvm_tgt_dev *tgt_dev,
 static void nvm_ppa_tgt_to_dev(struct nvm_tgt_dev *tgt_dev,
@@ -580,7 +581,7 @@ static void nvm_ppa_tgt_to_dev(struct nvm_tgt_dev *tgt_dev,
 
 
 	for (i = 0; i < nr_ppas; i++) {
 	for (i = 0; i < nr_ppas; i++) {
 		nvm_map_to_dev(tgt_dev, &ppa_list[i]);
 		nvm_map_to_dev(tgt_dev, &ppa_list[i]);
-		ppa_list[i] = generic_to_dev_addr(tgt_dev, ppa_list[i]);
+		ppa_list[i] = generic_to_dev_addr(tgt_dev->parent, ppa_list[i]);
 	}
 	}
 }
 }
 
 
@@ -590,7 +591,7 @@ static void nvm_ppa_dev_to_tgt(struct nvm_tgt_dev *tgt_dev,
 	int i;
 	int i;
 
 
 	for (i = 0; i < nr_ppas; i++) {
 	for (i = 0; i < nr_ppas; i++) {
-		ppa_list[i] = dev_to_generic_addr(tgt_dev, ppa_list[i]);
+		ppa_list[i] = dev_to_generic_addr(tgt_dev->parent, ppa_list[i]);
 		nvm_map_to_tgt(tgt_dev, &ppa_list[i]);
 		nvm_map_to_tgt(tgt_dev, &ppa_list[i]);
 	}
 	}
 }
 }
@@ -674,7 +675,7 @@ static int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd,
 	int i, plane_cnt, pl_idx;
 	int i, plane_cnt, pl_idx;
 	struct ppa_addr ppa;
 	struct ppa_addr ppa;
 
 
-	if (geo->plane_mode == NVM_PLANE_SINGLE && nr_ppas == 1) {
+	if (geo->pln_mode == NVM_PLANE_SINGLE && nr_ppas == 1) {
 		rqd->nr_ppas = nr_ppas;
 		rqd->nr_ppas = nr_ppas;
 		rqd->ppa_addr = ppas[0];
 		rqd->ppa_addr = ppas[0];
 
 
@@ -688,7 +689,7 @@ static int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd,
 		return -ENOMEM;
 		return -ENOMEM;
 	}
 	}
 
 
-	plane_cnt = geo->plane_mode;
+	plane_cnt = geo->pln_mode;
 	rqd->nr_ppas *= plane_cnt;
 	rqd->nr_ppas *= plane_cnt;
 
 
 	for (i = 0; i < nr_ppas; i++) {
 	for (i = 0; i < nr_ppas; i++) {
@@ -711,6 +712,17 @@ static void nvm_free_rqd_ppalist(struct nvm_tgt_dev *tgt_dev,
 	nvm_dev_dma_free(tgt_dev->parent, rqd->ppa_list, rqd->dma_ppa_list);
 	nvm_dev_dma_free(tgt_dev->parent, rqd->ppa_list, rqd->dma_ppa_list);
 }
 }
 
 
+int nvm_get_chunk_meta(struct nvm_tgt_dev *tgt_dev, struct nvm_chk_meta *meta,
+		struct ppa_addr ppa, int nchks)
+{
+	struct nvm_dev *dev = tgt_dev->parent;
+
+	nvm_ppa_tgt_to_dev(tgt_dev, &ppa, 1);
+
+	return dev->ops->get_chk_meta(tgt_dev->parent, meta,
+						(sector_t)ppa.ppa, nchks);
+}
+EXPORT_SYMBOL(nvm_get_chunk_meta);
 
 
 int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
 int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
 		       int nr_ppas, int type)
 		       int nr_ppas, int type)
@@ -719,7 +731,7 @@ int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
 	struct nvm_rq rqd;
 	struct nvm_rq rqd;
 	int ret;
 	int ret;
 
 
-	if (nr_ppas > dev->ops->max_phys_sect) {
+	if (nr_ppas > NVM_MAX_VLBA) {
 		pr_err("nvm: unable to update all blocks atomically\n");
 		pr_err("nvm: unable to update all blocks atomically\n");
 		return -EINVAL;
 		return -EINVAL;
 	}
 	}
@@ -740,14 +752,6 @@ int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
 }
 }
 EXPORT_SYMBOL(nvm_set_tgt_bb_tbl);
 EXPORT_SYMBOL(nvm_set_tgt_bb_tbl);
 
 
-int nvm_max_phys_sects(struct nvm_tgt_dev *tgt_dev)
-{
-	struct nvm_dev *dev = tgt_dev->parent;
-
-	return dev->ops->max_phys_sect;
-}
-EXPORT_SYMBOL(nvm_max_phys_sects);
-
 int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
 int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
 {
 {
 	struct nvm_dev *dev = tgt_dev->parent;
 	struct nvm_dev *dev = tgt_dev->parent;
@@ -814,15 +818,15 @@ int nvm_bb_tbl_fold(struct nvm_dev *dev, u8 *blks, int nr_blks)
 	struct nvm_geo *geo = &dev->geo;
 	struct nvm_geo *geo = &dev->geo;
 	int blk, offset, pl, blktype;
 	int blk, offset, pl, blktype;
 
 
-	if (nr_blks != geo->nr_chks * geo->plane_mode)
+	if (nr_blks != geo->num_chk * geo->pln_mode)
 		return -EINVAL;
 		return -EINVAL;
 
 
-	for (blk = 0; blk < geo->nr_chks; blk++) {
-		offset = blk * geo->plane_mode;
+	for (blk = 0; blk < geo->num_chk; blk++) {
+		offset = blk * geo->pln_mode;
 		blktype = blks[offset];
 		blktype = blks[offset];
 
 
 		/* Bad blocks on any planes take precedence over other types */
 		/* Bad blocks on any planes take precedence over other types */
-		for (pl = 0; pl < geo->plane_mode; pl++) {
+		for (pl = 0; pl < geo->pln_mode; pl++) {
 			if (blks[offset + pl] &
 			if (blks[offset + pl] &
 					(NVM_BLK_T_BAD|NVM_BLK_T_GRWN_BAD)) {
 					(NVM_BLK_T_BAD|NVM_BLK_T_GRWN_BAD)) {
 				blktype = blks[offset + pl];
 				blktype = blks[offset + pl];
@@ -833,7 +837,7 @@ int nvm_bb_tbl_fold(struct nvm_dev *dev, u8 *blks, int nr_blks)
 		blks[blk] = blktype;
 		blks[blk] = blktype;
 	}
 	}
 
 
-	return geo->nr_chks;
+	return geo->num_chk;
 }
 }
 EXPORT_SYMBOL(nvm_bb_tbl_fold);
 EXPORT_SYMBOL(nvm_bb_tbl_fold);
 
 
@@ -850,44 +854,9 @@ EXPORT_SYMBOL(nvm_get_tgt_bb_tbl);
 
 
 static int nvm_core_init(struct nvm_dev *dev)
 static int nvm_core_init(struct nvm_dev *dev)
 {
 {
-	struct nvm_id *id = &dev->identity;
-	struct nvm_id_group *grp = &id->grp;
 	struct nvm_geo *geo = &dev->geo;
 	struct nvm_geo *geo = &dev->geo;
 	int ret;
 	int ret;
 
 
-	memcpy(&geo->ppaf, &id->ppaf, sizeof(struct nvm_addr_format));
-
-	if (grp->mtype != 0) {
-		pr_err("nvm: memory type not supported\n");
-		return -EINVAL;
-	}
-
-	/* Whole device values */
-	geo->nr_chnls = grp->num_ch;
-	geo->nr_luns = grp->num_lun;
-
-	/* Generic device geometry values */
-	geo->ws_min = grp->ws_min;
-	geo->ws_opt = grp->ws_opt;
-	geo->ws_seq = grp->ws_seq;
-	geo->ws_per_chk = grp->ws_per_chk;
-	geo->nr_chks = grp->num_chk;
-	geo->sec_size = grp->csecs;
-	geo->oob_size = grp->sos;
-	geo->mccap = grp->mccap;
-	geo->max_rq_size = dev->ops->max_phys_sect * geo->sec_size;
-
-	geo->sec_per_chk = grp->clba;
-	geo->sec_per_lun = geo->sec_per_chk * geo->nr_chks;
-	geo->all_luns = geo->nr_luns * geo->nr_chnls;
-
-	/* 1.2 spec device geometry values */
-	geo->plane_mode = 1 << geo->ws_seq;
-	geo->nr_planes = geo->ws_opt / geo->ws_min;
-	geo->sec_per_pg = geo->ws_min;
-	geo->sec_per_pl = geo->sec_per_pg * geo->nr_planes;
-
-	dev->total_secs = geo->all_luns * geo->sec_per_lun;
 	dev->lun_map = kcalloc(BITS_TO_LONGS(geo->all_luns),
 	dev->lun_map = kcalloc(BITS_TO_LONGS(geo->all_luns),
 					sizeof(unsigned long), GFP_KERNEL);
 					sizeof(unsigned long), GFP_KERNEL);
 	if (!dev->lun_map)
 	if (!dev->lun_map)
@@ -902,7 +871,6 @@ static int nvm_core_init(struct nvm_dev *dev)
 	if (ret)
 	if (ret)
 		goto err_fmtype;
 		goto err_fmtype;
 
 
-	blk_queue_logical_block_size(dev->q, geo->sec_size);
 	return 0;
 	return 0;
 err_fmtype:
 err_fmtype:
 	kfree(dev->lun_map);
 	kfree(dev->lun_map);
@@ -927,18 +895,14 @@ static int nvm_init(struct nvm_dev *dev)
 	struct nvm_geo *geo = &dev->geo;
 	struct nvm_geo *geo = &dev->geo;
 	int ret = -EINVAL;
 	int ret = -EINVAL;
 
 
-	if (dev->ops->identity(dev, &dev->identity)) {
+	if (dev->ops->identity(dev)) {
 		pr_err("nvm: device could not be identified\n");
 		pr_err("nvm: device could not be identified\n");
 		goto err;
 		goto err;
 	}
 	}
 
 
-	pr_debug("nvm: ver:%x nvm_vendor:%x\n",
-			dev->identity.ver_id, dev->identity.vmnt);
-
-	if (dev->identity.ver_id != 1) {
-		pr_err("nvm: device not supported by kernel.");
-		goto err;
-	}
+	pr_debug("nvm: ver:%u.%u nvm_vendor:%x\n",
+				geo->major_ver_id, geo->minor_ver_id,
+				geo->vmnt);
 
 
 	ret = nvm_core_init(dev);
 	ret = nvm_core_init(dev);
 	if (ret) {
 	if (ret) {
@@ -946,10 +910,10 @@ static int nvm_init(struct nvm_dev *dev)
 		goto err;
 		goto err;
 	}
 	}
 
 
-	pr_info("nvm: registered %s [%u/%u/%u/%u/%u/%u]\n",
-			dev->name, geo->sec_per_pg, geo->nr_planes,
-			geo->ws_per_chk, geo->nr_chks,
-			geo->all_luns, geo->nr_chnls);
+	pr_info("nvm: registered %s [%u/%u/%u/%u/%u]\n",
+			dev->name, dev->geo.ws_min, dev->geo.ws_opt,
+			dev->geo.num_chk, dev->geo.all_luns,
+			dev->geo.num_ch);
 	return 0;
 	return 0;
 err:
 err:
 	pr_err("nvm: failed to initialize nvm\n");
 	pr_err("nvm: failed to initialize nvm\n");
@@ -969,17 +933,10 @@ int nvm_register(struct nvm_dev *dev)
 	if (!dev->q || !dev->ops)
 	if (!dev->q || !dev->ops)
 		return -EINVAL;
 		return -EINVAL;
 
 
-	if (dev->ops->max_phys_sect > 256) {
-		pr_info("nvm: max sectors supported is 256.\n");
-		return -EINVAL;
-	}
-
-	if (dev->ops->max_phys_sect > 1) {
-		dev->dma_pool = dev->ops->create_dma_pool(dev, "ppalist");
-		if (!dev->dma_pool) {
-			pr_err("nvm: could not create dma pool\n");
-			return -ENOMEM;
-		}
+	dev->dma_pool = dev->ops->create_dma_pool(dev, "ppalist");
+	if (!dev->dma_pool) {
+		pr_err("nvm: could not create dma pool\n");
+		return -ENOMEM;
 	}
 	}
 
 
 	ret = nvm_init(dev);
 	ret = nvm_init(dev);
@@ -1040,9 +997,6 @@ static long nvm_ioctl_info(struct file *file, void __user *arg)
 	struct nvm_tgt_type *tt;
 	struct nvm_tgt_type *tt;
 	int tgt_iter = 0;
 	int tgt_iter = 0;
 
 
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
 	info = memdup_user(arg, sizeof(struct nvm_ioctl_info));
 	info = memdup_user(arg, sizeof(struct nvm_ioctl_info));
 	if (IS_ERR(info))
 	if (IS_ERR(info))
 		return -EFAULT;
 		return -EFAULT;
@@ -1081,9 +1035,6 @@ static long nvm_ioctl_get_devices(struct file *file, void __user *arg)
 	struct nvm_dev *dev;
 	struct nvm_dev *dev;
 	int i = 0;
 	int i = 0;
 
 
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
 	devices = kzalloc(sizeof(struct nvm_ioctl_get_devices), GFP_KERNEL);
 	devices = kzalloc(sizeof(struct nvm_ioctl_get_devices), GFP_KERNEL);
 	if (!devices)
 	if (!devices)
 		return -ENOMEM;
 		return -ENOMEM;
@@ -1124,9 +1075,6 @@ static long nvm_ioctl_dev_create(struct file *file, void __user *arg)
 {
 {
 	struct nvm_ioctl_create create;
 	struct nvm_ioctl_create create;
 
 
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
 	if (copy_from_user(&create, arg, sizeof(struct nvm_ioctl_create)))
 	if (copy_from_user(&create, arg, sizeof(struct nvm_ioctl_create)))
 		return -EFAULT;
 		return -EFAULT;
 
 
@@ -1162,9 +1110,6 @@ static long nvm_ioctl_dev_remove(struct file *file, void __user *arg)
 	struct nvm_dev *dev;
 	struct nvm_dev *dev;
 	int ret = 0;
 	int ret = 0;
 
 
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
 	if (copy_from_user(&remove, arg, sizeof(struct nvm_ioctl_remove)))
 	if (copy_from_user(&remove, arg, sizeof(struct nvm_ioctl_remove)))
 		return -EFAULT;
 		return -EFAULT;
 
 
@@ -1189,9 +1134,6 @@ static long nvm_ioctl_dev_init(struct file *file, void __user *arg)
 {
 {
 	struct nvm_ioctl_dev_init init;
 	struct nvm_ioctl_dev_init init;
 
 
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
 	if (copy_from_user(&init, arg, sizeof(struct nvm_ioctl_dev_init)))
 	if (copy_from_user(&init, arg, sizeof(struct nvm_ioctl_dev_init)))
 		return -EFAULT;
 		return -EFAULT;
 
 
@@ -1208,9 +1150,6 @@ static long nvm_ioctl_dev_factory(struct file *file, void __user *arg)
 {
 {
 	struct nvm_ioctl_dev_factory fact;
 	struct nvm_ioctl_dev_factory fact;
 
 
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
 	if (copy_from_user(&fact, arg, sizeof(struct nvm_ioctl_dev_factory)))
 	if (copy_from_user(&fact, arg, sizeof(struct nvm_ioctl_dev_factory)))
 		return -EFAULT;
 		return -EFAULT;
 
 
@@ -1226,6 +1165,9 @@ static long nvm_ctl_ioctl(struct file *file, uint cmd, unsigned long arg)
 {
 {
 	void __user *argp = (void __user *)arg;
 	void __user *argp = (void __user *)arg;
 
 
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
 	switch (cmd) {
 	switch (cmd) {
 	case NVM_INFO:
 	case NVM_INFO:
 		return nvm_ioctl_info(file, argp);
 		return nvm_ioctl_info(file, argp);

+ 4 - 0
drivers/lightnvm/pblk-cache.c

@@ -63,6 +63,8 @@ retry:
 		bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE);
 		bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE);
 	}
 	}
 
 
+	atomic64_add(nr_entries, &pblk->user_wa);
+
 #ifdef CONFIG_NVM_DEBUG
 #ifdef CONFIG_NVM_DEBUG
 	atomic_long_add(nr_entries, &pblk->inflight_writes);
 	atomic_long_add(nr_entries, &pblk->inflight_writes);
 	atomic_long_add(nr_entries, &pblk->req_writes);
 	atomic_long_add(nr_entries, &pblk->req_writes);
@@ -117,6 +119,8 @@ retry:
 	WARN_ONCE(gc_rq->secs_to_gc != valid_entries,
 	WARN_ONCE(gc_rq->secs_to_gc != valid_entries,
 					"pblk: inconsistent GC write\n");
 					"pblk: inconsistent GC write\n");
 
 
+	atomic64_add(valid_entries, &pblk->gc_wa);
+
 #ifdef CONFIG_NVM_DEBUG
 #ifdef CONFIG_NVM_DEBUG
 	atomic_long_add(valid_entries, &pblk->inflight_writes);
 	atomic_long_add(valid_entries, &pblk->inflight_writes);
 	atomic_long_add(valid_entries, &pblk->recov_gc_writes);
 	atomic_long_add(valid_entries, &pblk->recov_gc_writes);

+ 154 - 48
drivers/lightnvm/pblk-core.c

@@ -44,11 +44,12 @@ static void pblk_line_mark_bb(struct work_struct *work)
 }
 }
 
 
 static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line,
 static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line,
-			 struct ppa_addr *ppa)
+			 struct ppa_addr ppa_addr)
 {
 {
 	struct nvm_tgt_dev *dev = pblk->dev;
 	struct nvm_tgt_dev *dev = pblk->dev;
 	struct nvm_geo *geo = &dev->geo;
 	struct nvm_geo *geo = &dev->geo;
-	int pos = pblk_ppa_to_pos(geo, *ppa);
+	struct ppa_addr *ppa;
+	int pos = pblk_ppa_to_pos(geo, ppa_addr);
 
 
 	pr_debug("pblk: erase failed: line:%d, pos:%d\n", line->id, pos);
 	pr_debug("pblk: erase failed: line:%d, pos:%d\n", line->id, pos);
 	atomic_long_inc(&pblk->erase_failed);
 	atomic_long_inc(&pblk->erase_failed);
@@ -58,26 +59,38 @@ static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line,
 		pr_err("pblk: attempted to erase bb: line:%d, pos:%d\n",
 		pr_err("pblk: attempted to erase bb: line:%d, pos:%d\n",
 							line->id, pos);
 							line->id, pos);
 
 
+	/* Not necessary to mark bad blocks on 2.0 spec. */
+	if (geo->version == NVM_OCSSD_SPEC_20)
+		return;
+
+	ppa = kmalloc(sizeof(struct ppa_addr), GFP_ATOMIC);
+	if (!ppa)
+		return;
+
+	*ppa = ppa_addr;
 	pblk_gen_run_ws(pblk, NULL, ppa, pblk_line_mark_bb,
 	pblk_gen_run_ws(pblk, NULL, ppa, pblk_line_mark_bb,
 						GFP_ATOMIC, pblk->bb_wq);
 						GFP_ATOMIC, pblk->bb_wq);
 }
 }
 
 
 static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd)
 static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd)
 {
 {
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct nvm_chk_meta *chunk;
 	struct pblk_line *line;
 	struct pblk_line *line;
+	int pos;
 
 
 	line = &pblk->lines[pblk_ppa_to_line(rqd->ppa_addr)];
 	line = &pblk->lines[pblk_ppa_to_line(rqd->ppa_addr)];
+	pos = pblk_ppa_to_pos(geo, rqd->ppa_addr);
+	chunk = &line->chks[pos];
+
 	atomic_dec(&line->left_seblks);
 	atomic_dec(&line->left_seblks);
 
 
 	if (rqd->error) {
 	if (rqd->error) {
-		struct ppa_addr *ppa;
-
-		ppa = kmalloc(sizeof(struct ppa_addr), GFP_ATOMIC);
-		if (!ppa)
-			return;
-
-		*ppa = rqd->ppa_addr;
-		pblk_mark_bb(pblk, line, ppa);
+		chunk->state = NVM_CHK_ST_OFFLINE;
+		pblk_mark_bb(pblk, line, rqd->ppa_addr);
+	} else {
+		chunk->state = NVM_CHK_ST_FREE;
 	}
 	}
 
 
 	atomic_dec(&pblk->inflight_io);
 	atomic_dec(&pblk->inflight_io);
@@ -92,6 +105,49 @@ static void pblk_end_io_erase(struct nvm_rq *rqd)
 	mempool_free(rqd, pblk->e_rq_pool);
 	mempool_free(rqd, pblk->e_rq_pool);
 }
 }
 
 
+/*
+ * Get information for all chunks from the device.
+ *
+ * The caller is responsible for freeing the returned structure
+ */
+struct nvm_chk_meta *pblk_chunk_get_info(struct pblk *pblk)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct nvm_chk_meta *meta;
+	struct ppa_addr ppa;
+	unsigned long len;
+	int ret;
+
+	ppa.ppa = 0;
+
+	len = geo->all_chunks * sizeof(*meta);
+	meta = kzalloc(len, GFP_KERNEL);
+	if (!meta)
+		return ERR_PTR(-ENOMEM);
+
+	ret = nvm_get_chunk_meta(dev, meta, ppa, geo->all_chunks);
+	if (ret) {
+		kfree(meta);
+		return ERR_PTR(-EIO);
+	}
+
+	return meta;
+}
+
+struct nvm_chk_meta *pblk_chunk_get_off(struct pblk *pblk,
+					      struct nvm_chk_meta *meta,
+					      struct ppa_addr ppa)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	int ch_off = ppa.m.grp * geo->num_chk * geo->num_lun;
+	int lun_off = ppa.m.pu * geo->num_chk;
+	int chk_off = ppa.m.chk;
+
+	return meta + ch_off + lun_off + chk_off;
+}
+
 void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line,
 void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line,
 			   u64 paddr)
 			   u64 paddr)
 {
 {
@@ -613,7 +669,7 @@ next_rq:
 	memset(&rqd, 0, sizeof(struct nvm_rq));
 	memset(&rqd, 0, sizeof(struct nvm_rq));
 
 
 	rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
 	rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
-	rq_len = rq_ppas * geo->sec_size;
+	rq_len = rq_ppas * geo->csecs;
 
 
 	bio = pblk_bio_map_addr(pblk, emeta_buf, rq_ppas, rq_len,
 	bio = pblk_bio_map_addr(pblk, emeta_buf, rq_ppas, rq_len,
 					l_mg->emeta_alloc_type, GFP_KERNEL);
 					l_mg->emeta_alloc_type, GFP_KERNEL);
@@ -722,7 +778,7 @@ u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line)
 	if (bit >= lm->blk_per_line)
 	if (bit >= lm->blk_per_line)
 		return -1;
 		return -1;
 
 
-	return bit * geo->sec_per_pl;
+	return bit * geo->ws_opt;
 }
 }
 
 
 static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line,
 static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line,
@@ -885,7 +941,7 @@ int pblk_line_erase(struct pblk *pblk, struct pblk_line *line)
 		}
 		}
 
 
 		ppa = pblk->luns[bit].bppa; /* set ch and lun */
 		ppa = pblk->luns[bit].bppa; /* set ch and lun */
-		ppa.g.blk = line->id;
+		ppa.a.blk = line->id;
 
 
 		atomic_dec(&line->left_eblks);
 		atomic_dec(&line->left_eblks);
 		WARN_ON(test_and_set_bit(bit, line->erase_bitmap));
 		WARN_ON(test_and_set_bit(bit, line->erase_bitmap));
@@ -975,7 +1031,8 @@ static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line,
 	memcpy(smeta_buf->header.uuid, pblk->instance_uuid, 16);
 	memcpy(smeta_buf->header.uuid, pblk->instance_uuid, 16);
 	smeta_buf->header.id = cpu_to_le32(line->id);
 	smeta_buf->header.id = cpu_to_le32(line->id);
 	smeta_buf->header.type = cpu_to_le16(line->type);
 	smeta_buf->header.type = cpu_to_le16(line->type);
-	smeta_buf->header.version = SMETA_VERSION;
+	smeta_buf->header.version_major = SMETA_VERSION_MAJOR;
+	smeta_buf->header.version_minor = SMETA_VERSION_MINOR;
 
 
 	/* Start metadata */
 	/* Start metadata */
 	smeta_buf->seq_nr = cpu_to_le64(line->seq_nr);
 	smeta_buf->seq_nr = cpu_to_le64(line->seq_nr);
@@ -998,6 +1055,12 @@ static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line,
 	/* End metadata */
 	/* End metadata */
 	memcpy(&emeta_buf->header, &smeta_buf->header,
 	memcpy(&emeta_buf->header, &smeta_buf->header,
 						sizeof(struct line_header));
 						sizeof(struct line_header));
+
+	emeta_buf->header.version_major = EMETA_VERSION_MAJOR;
+	emeta_buf->header.version_minor = EMETA_VERSION_MINOR;
+	emeta_buf->header.crc = cpu_to_le32(
+			pblk_calc_meta_header_crc(pblk, &emeta_buf->header));
+
 	emeta_buf->seq_nr = cpu_to_le64(line->seq_nr);
 	emeta_buf->seq_nr = cpu_to_le64(line->seq_nr);
 	emeta_buf->nr_lbas = cpu_to_le64(line->sec_in_line);
 	emeta_buf->nr_lbas = cpu_to_le64(line->sec_in_line);
 	emeta_buf->nr_valid_lbas = cpu_to_le64(0);
 	emeta_buf->nr_valid_lbas = cpu_to_le64(0);
@@ -1018,28 +1081,26 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line,
 	struct nvm_geo *geo = &dev->geo;
 	struct nvm_geo *geo = &dev->geo;
 	struct pblk_line_meta *lm = &pblk->lm;
 	struct pblk_line_meta *lm = &pblk->lm;
 	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
 	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
-	int nr_bb = 0;
 	u64 off;
 	u64 off;
 	int bit = -1;
 	int bit = -1;
+	int emeta_secs;
 
 
 	line->sec_in_line = lm->sec_per_line;
 	line->sec_in_line = lm->sec_per_line;
 
 
 	/* Capture bad block information on line mapping bitmaps */
 	/* Capture bad block information on line mapping bitmaps */
 	while ((bit = find_next_bit(line->blk_bitmap, lm->blk_per_line,
 	while ((bit = find_next_bit(line->blk_bitmap, lm->blk_per_line,
 					bit + 1)) < lm->blk_per_line) {
 					bit + 1)) < lm->blk_per_line) {
-		off = bit * geo->sec_per_pl;
+		off = bit * geo->ws_opt;
 		bitmap_shift_left(l_mg->bb_aux, l_mg->bb_template, off,
 		bitmap_shift_left(l_mg->bb_aux, l_mg->bb_template, off,
 							lm->sec_per_line);
 							lm->sec_per_line);
 		bitmap_or(line->map_bitmap, line->map_bitmap, l_mg->bb_aux,
 		bitmap_or(line->map_bitmap, line->map_bitmap, l_mg->bb_aux,
 							lm->sec_per_line);
 							lm->sec_per_line);
-		line->sec_in_line -= geo->sec_per_chk;
-		if (bit >= lm->emeta_bb)
-			nr_bb++;
+		line->sec_in_line -= geo->clba;
 	}
 	}
 
 
 	/* Mark smeta metadata sectors as bad sectors */
 	/* Mark smeta metadata sectors as bad sectors */
 	bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line);
 	bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line);
-	off = bit * geo->sec_per_pl;
+	off = bit * geo->ws_opt;
 	bitmap_set(line->map_bitmap, off, lm->smeta_sec);
 	bitmap_set(line->map_bitmap, off, lm->smeta_sec);
 	line->sec_in_line -= lm->smeta_sec;
 	line->sec_in_line -= lm->smeta_sec;
 	line->smeta_ssec = off;
 	line->smeta_ssec = off;
@@ -1055,18 +1116,18 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line,
 	/* Mark emeta metadata sectors as bad sectors. We need to consider bad
 	/* Mark emeta metadata sectors as bad sectors. We need to consider bad
 	 * blocks to make sure that there are enough sectors to store emeta
 	 * blocks to make sure that there are enough sectors to store emeta
 	 */
 	 */
-	off = lm->sec_per_line - lm->emeta_sec[0];
-	bitmap_set(line->invalid_bitmap, off, lm->emeta_sec[0]);
-	while (nr_bb) {
-		off -= geo->sec_per_pl;
+	emeta_secs = lm->emeta_sec[0];
+	off = lm->sec_per_line;
+	while (emeta_secs) {
+		off -= geo->ws_opt;
 		if (!test_bit(off, line->invalid_bitmap)) {
 		if (!test_bit(off, line->invalid_bitmap)) {
-			bitmap_set(line->invalid_bitmap, off, geo->sec_per_pl);
-			nr_bb--;
+			bitmap_set(line->invalid_bitmap, off, geo->ws_opt);
+			emeta_secs -= geo->ws_opt;
 		}
 		}
 	}
 	}
 
 
-	line->sec_in_line -= lm->emeta_sec[0];
 	line->emeta_ssec = off;
 	line->emeta_ssec = off;
+	line->sec_in_line -= lm->emeta_sec[0];
 	line->nr_valid_lbas = 0;
 	line->nr_valid_lbas = 0;
 	line->left_msecs = line->sec_in_line;
 	line->left_msecs = line->sec_in_line;
 	*line->vsc = cpu_to_le32(line->sec_in_line);
 	*line->vsc = cpu_to_le32(line->sec_in_line);
@@ -1086,10 +1147,34 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line,
 	return 1;
 	return 1;
 }
 }
 
 
+static int pblk_prepare_new_line(struct pblk *pblk, struct pblk_line *line)
+{
+	struct pblk_line_meta *lm = &pblk->lm;
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	int blk_to_erase = atomic_read(&line->blk_in_line);
+	int i;
+
+	for (i = 0; i < lm->blk_per_line; i++) {
+		struct pblk_lun *rlun = &pblk->luns[i];
+		int pos = pblk_ppa_to_pos(geo, rlun->bppa);
+		int state = line->chks[pos].state;
+
+		/* Free chunks should not be erased */
+		if (state & NVM_CHK_ST_FREE) {
+			set_bit(pblk_ppa_to_pos(geo, rlun->bppa),
+							line->erase_bitmap);
+			blk_to_erase--;
+		}
+	}
+
+	return blk_to_erase;
+}
+
 static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line)
 static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line)
 {
 {
 	struct pblk_line_meta *lm = &pblk->lm;
 	struct pblk_line_meta *lm = &pblk->lm;
-	int blk_in_line = atomic_read(&line->blk_in_line);
+	int blk_to_erase;
 
 
 	line->map_bitmap = kzalloc(lm->sec_bitmap_len, GFP_ATOMIC);
 	line->map_bitmap = kzalloc(lm->sec_bitmap_len, GFP_ATOMIC);
 	if (!line->map_bitmap)
 	if (!line->map_bitmap)
@@ -1102,7 +1187,21 @@ static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line)
 		return -ENOMEM;
 		return -ENOMEM;
 	}
 	}
 
 
+	/* Bad blocks do not need to be erased */
+	bitmap_copy(line->erase_bitmap, line->blk_bitmap, lm->blk_per_line);
+
 	spin_lock(&line->lock);
 	spin_lock(&line->lock);
+
+	/* If we have not written to this line, we need to mark up free chunks
+	 * as already erased
+	 */
+	if (line->state == PBLK_LINESTATE_NEW) {
+		blk_to_erase = pblk_prepare_new_line(pblk, line);
+		line->state = PBLK_LINESTATE_FREE;
+	} else {
+		blk_to_erase = atomic_read(&line->blk_in_line);
+	}
+
 	if (line->state != PBLK_LINESTATE_FREE) {
 	if (line->state != PBLK_LINESTATE_FREE) {
 		kfree(line->map_bitmap);
 		kfree(line->map_bitmap);
 		kfree(line->invalid_bitmap);
 		kfree(line->invalid_bitmap);
@@ -1114,15 +1213,12 @@ static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line)
 
 
 	line->state = PBLK_LINESTATE_OPEN;
 	line->state = PBLK_LINESTATE_OPEN;
 
 
-	atomic_set(&line->left_eblks, blk_in_line);
-	atomic_set(&line->left_seblks, blk_in_line);
+	atomic_set(&line->left_eblks, blk_to_erase);
+	atomic_set(&line->left_seblks, blk_to_erase);
 
 
 	line->meta_distance = lm->meta_distance;
 	line->meta_distance = lm->meta_distance;
 	spin_unlock(&line->lock);
 	spin_unlock(&line->lock);
 
 
-	/* Bad blocks do not need to be erased */
-	bitmap_copy(line->erase_bitmap, line->blk_bitmap, lm->blk_per_line);
-
 	kref_init(&line->ref);
 	kref_init(&line->ref);
 
 
 	return 0;
 	return 0;
@@ -1399,13 +1495,6 @@ struct pblk_line *pblk_line_replace_data(struct pblk *pblk)
 	l_mg->data_line = new;
 	l_mg->data_line = new;
 
 
 	spin_lock(&l_mg->free_lock);
 	spin_lock(&l_mg->free_lock);
-	if (pblk->state != PBLK_STATE_RUNNING) {
-		l_mg->data_line = NULL;
-		l_mg->data_next = NULL;
-		spin_unlock(&l_mg->free_lock);
-		goto out;
-	}
-
 	pblk_line_setup_metadata(new, l_mg, &pblk->lm);
 	pblk_line_setup_metadata(new, l_mg, &pblk->lm);
 	spin_unlock(&l_mg->free_lock);
 	spin_unlock(&l_mg->free_lock);
 
 
@@ -1585,12 +1674,14 @@ static void pblk_line_should_sync_meta(struct pblk *pblk)
 
 
 void pblk_line_close(struct pblk *pblk, struct pblk_line *line)
 void pblk_line_close(struct pblk *pblk, struct pblk_line *line)
 {
 {
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct pblk_line_meta *lm = &pblk->lm;
 	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
 	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
 	struct list_head *move_list;
 	struct list_head *move_list;
+	int i;
 
 
 #ifdef CONFIG_NVM_DEBUG
 #ifdef CONFIG_NVM_DEBUG
-	struct pblk_line_meta *lm = &pblk->lm;
-
 	WARN(!bitmap_full(line->map_bitmap, lm->sec_per_line),
 	WARN(!bitmap_full(line->map_bitmap, lm->sec_per_line),
 				"pblk: corrupt closed line %d\n", line->id);
 				"pblk: corrupt closed line %d\n", line->id);
 #endif
 #endif
@@ -1612,6 +1703,15 @@ void pblk_line_close(struct pblk *pblk, struct pblk_line *line)
 	line->smeta = NULL;
 	line->smeta = NULL;
 	line->emeta = NULL;
 	line->emeta = NULL;
 
 
+	for (i = 0; i < lm->blk_per_line; i++) {
+		struct pblk_lun *rlun = &pblk->luns[i];
+		int pos = pblk_ppa_to_pos(geo, rlun->bppa);
+		int state = line->chks[pos].state;
+
+		if (!(state & NVM_CHK_ST_OFFLINE))
+			state = NVM_CHK_ST_CLOSED;
+	}
+
 	spin_unlock(&line->lock);
 	spin_unlock(&line->lock);
 	spin_unlock(&l_mg->gc_lock);
 	spin_unlock(&l_mg->gc_lock);
 }
 }
@@ -1622,11 +1722,16 @@ void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line)
 	struct pblk_line_meta *lm = &pblk->lm;
 	struct pblk_line_meta *lm = &pblk->lm;
 	struct pblk_emeta *emeta = line->emeta;
 	struct pblk_emeta *emeta = line->emeta;
 	struct line_emeta *emeta_buf = emeta->buf;
 	struct line_emeta *emeta_buf = emeta->buf;
+	struct wa_counters *wa = emeta_to_wa(lm, emeta_buf);
 
 
 	/* No need for exact vsc value; avoid a big line lock and take aprox. */
 	/* No need for exact vsc value; avoid a big line lock and take aprox. */
 	memcpy(emeta_to_vsc(pblk, emeta_buf), l_mg->vsc_list, lm->vsc_list_len);
 	memcpy(emeta_to_vsc(pblk, emeta_buf), l_mg->vsc_list, lm->vsc_list_len);
 	memcpy(emeta_to_bb(emeta_buf), line->blk_bitmap, lm->blk_bitmap_len);
 	memcpy(emeta_to_bb(emeta_buf), line->blk_bitmap, lm->blk_bitmap_len);
 
 
+	wa->user = cpu_to_le64(atomic64_read(&pblk->user_wa));
+	wa->pad = cpu_to_le64(atomic64_read(&pblk->pad_wa));
+	wa->gc = cpu_to_le64(atomic64_read(&pblk->gc_wa));
+
 	emeta_buf->nr_valid_lbas = cpu_to_le64(line->nr_valid_lbas);
 	emeta_buf->nr_valid_lbas = cpu_to_le64(line->nr_valid_lbas);
 	emeta_buf->crc = cpu_to_le32(pblk_calc_emeta_crc(pblk, emeta_buf));
 	emeta_buf->crc = cpu_to_le32(pblk_calc_emeta_crc(pblk, emeta_buf));
 
 
@@ -1680,8 +1785,8 @@ static void __pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list,
 	int i;
 	int i;
 
 
 	for (i = 1; i < nr_ppas; i++)
 	for (i = 1; i < nr_ppas; i++)
-		WARN_ON(ppa_list[0].g.lun != ppa_list[i].g.lun ||
-				ppa_list[0].g.ch != ppa_list[i].g.ch);
+		WARN_ON(ppa_list[0].a.lun != ppa_list[i].a.lun ||
+				ppa_list[0].a.ch != ppa_list[i].a.ch);
 #endif
 #endif
 
 
 	ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(30000));
 	ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(30000));
@@ -1725,8 +1830,8 @@ void pblk_up_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas)
 	int i;
 	int i;
 
 
 	for (i = 1; i < nr_ppas; i++)
 	for (i = 1; i < nr_ppas; i++)
-		WARN_ON(ppa_list[0].g.lun != ppa_list[i].g.lun ||
-				ppa_list[0].g.ch != ppa_list[i].g.ch);
+		WARN_ON(ppa_list[0].a.lun != ppa_list[i].a.lun ||
+				ppa_list[0].a.ch != ppa_list[i].a.ch);
 #endif
 #endif
 
 
 	rlun = &pblk->luns[pos];
 	rlun = &pblk->luns[pos];
@@ -1739,10 +1844,10 @@ void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
 	struct nvm_tgt_dev *dev = pblk->dev;
 	struct nvm_tgt_dev *dev = pblk->dev;
 	struct nvm_geo *geo = &dev->geo;
 	struct nvm_geo *geo = &dev->geo;
 	struct pblk_lun *rlun;
 	struct pblk_lun *rlun;
-	int nr_luns = geo->all_luns;
+	int num_lun = geo->all_luns;
 	int bit = -1;
 	int bit = -1;
 
 
-	while ((bit = find_next_bit(lun_bitmap, nr_luns, bit + 1)) < nr_luns) {
+	while ((bit = find_next_bit(lun_bitmap, num_lun, bit + 1)) < num_lun) {
 		rlun = &pblk->luns[bit];
 		rlun = &pblk->luns[bit];
 		up(&rlun->wr_sem);
 		up(&rlun->wr_sem);
 	}
 	}
@@ -1829,6 +1934,7 @@ void pblk_update_map_dev(struct pblk *pblk, sector_t lba,
 #endif
 #endif
 	/* Invalidate and discard padded entries */
 	/* Invalidate and discard padded entries */
 	if (lba == ADDR_EMPTY) {
 	if (lba == ADDR_EMPTY) {
+		atomic64_inc(&pblk->pad_wa);
 #ifdef CONFIG_NVM_DEBUG
 #ifdef CONFIG_NVM_DEBUG
 		atomic_long_inc(&pblk->padded_wb);
 		atomic_long_inc(&pblk->padded_wb);
 #endif
 #endif

+ 4 - 8
drivers/lightnvm/pblk-gc.c

@@ -88,7 +88,7 @@ static void pblk_gc_line_ws(struct work_struct *work)
 
 
 	up(&gc->gc_sem);
 	up(&gc->gc_sem);
 
 
-	gc_rq->data = vmalloc(gc_rq->nr_secs * geo->sec_size);
+	gc_rq->data = vmalloc(gc_rq->nr_secs * geo->csecs);
 	if (!gc_rq->data) {
 	if (!gc_rq->data) {
 		pr_err("pblk: could not GC line:%d (%d/%d)\n",
 		pr_err("pblk: could not GC line:%d (%d/%d)\n",
 					line->id, *line->vsc, gc_rq->nr_secs);
 					line->id, *line->vsc, gc_rq->nr_secs);
@@ -147,10 +147,8 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work)
 	int ret;
 	int ret;
 
 
 	invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_KERNEL);
 	invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_KERNEL);
-	if (!invalid_bitmap) {
-		pr_err("pblk: could not allocate GC invalid bitmap\n");
+	if (!invalid_bitmap)
 		goto fail_free_ws;
 		goto fail_free_ws;
-	}
 
 
 	emeta_buf = pblk_malloc(lm->emeta_len[0], l_mg->emeta_alloc_type,
 	emeta_buf = pblk_malloc(lm->emeta_len[0], l_mg->emeta_alloc_type,
 								GFP_KERNEL);
 								GFP_KERNEL);
@@ -666,12 +664,10 @@ void pblk_gc_exit(struct pblk *pblk)
 		kthread_stop(gc->gc_reader_ts);
 		kthread_stop(gc->gc_reader_ts);
 
 
 	flush_workqueue(gc->gc_reader_wq);
 	flush_workqueue(gc->gc_reader_wq);
-	if (gc->gc_reader_wq)
-		destroy_workqueue(gc->gc_reader_wq);
+	destroy_workqueue(gc->gc_reader_wq);
 
 
 	flush_workqueue(gc->gc_line_reader_wq);
 	flush_workqueue(gc->gc_line_reader_wq);
-	if (gc->gc_line_reader_wq)
-		destroy_workqueue(gc->gc_line_reader_wq);
+	destroy_workqueue(gc->gc_line_reader_wq);
 
 
 	if (gc->gc_writer_ts)
 	if (gc->gc_writer_ts)
 		kthread_stop(gc->gc_writer_ts);
 		kthread_stop(gc->gc_writer_ts);

+ 499 - 321
drivers/lightnvm/pblk-init.c

@@ -80,7 +80,7 @@ static size_t pblk_trans_map_size(struct pblk *pblk)
 {
 {
 	int entry_size = 8;
 	int entry_size = 8;
 
 
-	if (pblk->ppaf_bitsize < 32)
+	if (pblk->addrf_len < 32)
 		entry_size = 4;
 		entry_size = 4;
 
 
 	return entry_size * pblk->rl.nr_secs;
 	return entry_size * pblk->rl.nr_secs;
@@ -103,7 +103,40 @@ static void pblk_l2p_free(struct pblk *pblk)
 	vfree(pblk->trans_map);
 	vfree(pblk->trans_map);
 }
 }
 
 
-static int pblk_l2p_init(struct pblk *pblk)
+static int pblk_l2p_recover(struct pblk *pblk, bool factory_init)
+{
+	struct pblk_line *line = NULL;
+
+	if (factory_init) {
+		pblk_setup_uuid(pblk);
+	} else {
+		line = pblk_recov_l2p(pblk);
+		if (IS_ERR(line)) {
+			pr_err("pblk: could not recover l2p table\n");
+			return -EFAULT;
+		}
+	}
+
+#ifdef CONFIG_NVM_DEBUG
+	pr_info("pblk init: L2P CRC: %x\n", pblk_l2p_crc(pblk));
+#endif
+
+	/* Free full lines directly as GC has not been started yet */
+	pblk_gc_free_full_lines(pblk);
+
+	if (!line) {
+		/* Configure next line for user data */
+		line = pblk_line_get_first_data(pblk);
+		if (!line) {
+			pr_err("pblk: line list corrupted\n");
+			return -EFAULT;
+		}
+	}
+
+	return 0;
+}
+
+static int pblk_l2p_init(struct pblk *pblk, bool factory_init)
 {
 {
 	sector_t i;
 	sector_t i;
 	struct ppa_addr ppa;
 	struct ppa_addr ppa;
@@ -119,7 +152,7 @@ static int pblk_l2p_init(struct pblk *pblk)
 	for (i = 0; i < pblk->rl.nr_secs; i++)
 	for (i = 0; i < pblk->rl.nr_secs; i++)
 		pblk_trans_map_set(pblk, i, ppa);
 		pblk_trans_map_set(pblk, i, ppa);
 
 
-	return 0;
+	return pblk_l2p_recover(pblk, factory_init);
 }
 }
 
 
 static void pblk_rwb_free(struct pblk *pblk)
 static void pblk_rwb_free(struct pblk *pblk)
@@ -146,7 +179,7 @@ static int pblk_rwb_init(struct pblk *pblk)
 		return -ENOMEM;
 		return -ENOMEM;
 
 
 	power_size = get_count_order(nr_entries);
 	power_size = get_count_order(nr_entries);
-	power_seg_sz = get_count_order(geo->sec_size);
+	power_seg_sz = get_count_order(geo->csecs);
 
 
 	return pblk_rb_init(&pblk->rwb, entries, power_size, power_seg_sz);
 	return pblk_rb_init(&pblk->rwb, entries, power_size, power_seg_sz);
 }
 }
@@ -154,47 +187,103 @@ static int pblk_rwb_init(struct pblk *pblk)
 /* Minimum pages needed within a lun */
 /* Minimum pages needed within a lun */
 #define ADDR_POOL_SIZE 64
 #define ADDR_POOL_SIZE 64
 
 
-static int pblk_set_ppaf(struct pblk *pblk)
+static int pblk_set_addrf_12(struct nvm_geo *geo, struct nvm_addrf_12 *dst)
 {
 {
-	struct nvm_tgt_dev *dev = pblk->dev;
-	struct nvm_geo *geo = &dev->geo;
-	struct nvm_addr_format ppaf = geo->ppaf;
+	struct nvm_addrf_12 *src = (struct nvm_addrf_12 *)&geo->addrf;
 	int power_len;
 	int power_len;
 
 
 	/* Re-calculate channel and lun format to adapt to configuration */
 	/* Re-calculate channel and lun format to adapt to configuration */
-	power_len = get_count_order(geo->nr_chnls);
-	if (1 << power_len != geo->nr_chnls) {
+	power_len = get_count_order(geo->num_ch);
+	if (1 << power_len != geo->num_ch) {
 		pr_err("pblk: supports only power-of-two channel config.\n");
 		pr_err("pblk: supports only power-of-two channel config.\n");
 		return -EINVAL;
 		return -EINVAL;
 	}
 	}
-	ppaf.ch_len = power_len;
+	dst->ch_len = power_len;
 
 
-	power_len = get_count_order(geo->nr_luns);
-	if (1 << power_len != geo->nr_luns) {
+	power_len = get_count_order(geo->num_lun);
+	if (1 << power_len != geo->num_lun) {
 		pr_err("pblk: supports only power-of-two LUN config.\n");
 		pr_err("pblk: supports only power-of-two LUN config.\n");
 		return -EINVAL;
 		return -EINVAL;
 	}
 	}
-	ppaf.lun_len = power_len;
-
-	pblk->ppaf.sec_offset = 0;
-	pblk->ppaf.pln_offset = ppaf.sect_len;
-	pblk->ppaf.ch_offset = pblk->ppaf.pln_offset + ppaf.pln_len;
-	pblk->ppaf.lun_offset = pblk->ppaf.ch_offset + ppaf.ch_len;
-	pblk->ppaf.pg_offset = pblk->ppaf.lun_offset + ppaf.lun_len;
-	pblk->ppaf.blk_offset = pblk->ppaf.pg_offset + ppaf.pg_len;
-	pblk->ppaf.sec_mask = (1ULL << ppaf.sect_len) - 1;
-	pblk->ppaf.pln_mask = ((1ULL << ppaf.pln_len) - 1) <<
-							pblk->ppaf.pln_offset;
-	pblk->ppaf.ch_mask = ((1ULL << ppaf.ch_len) - 1) <<
-							pblk->ppaf.ch_offset;
-	pblk->ppaf.lun_mask = ((1ULL << ppaf.lun_len) - 1) <<
-							pblk->ppaf.lun_offset;
-	pblk->ppaf.pg_mask = ((1ULL << ppaf.pg_len) - 1) <<
-							pblk->ppaf.pg_offset;
-	pblk->ppaf.blk_mask = ((1ULL << ppaf.blk_len) - 1) <<
-							pblk->ppaf.blk_offset;
-
-	pblk->ppaf_bitsize = pblk->ppaf.blk_offset + ppaf.blk_len;
+	dst->lun_len = power_len;
+
+	dst->blk_len = src->blk_len;
+	dst->pg_len = src->pg_len;
+	dst->pln_len = src->pln_len;
+	dst->sec_len = src->sec_len;
+
+	dst->sec_offset = 0;
+	dst->pln_offset = dst->sec_len;
+	dst->ch_offset = dst->pln_offset + dst->pln_len;
+	dst->lun_offset = dst->ch_offset + dst->ch_len;
+	dst->pg_offset = dst->lun_offset + dst->lun_len;
+	dst->blk_offset = dst->pg_offset + dst->pg_len;
+
+	dst->sec_mask = ((1ULL << dst->sec_len) - 1) << dst->sec_offset;
+	dst->pln_mask = ((1ULL << dst->pln_len) - 1) << dst->pln_offset;
+	dst->ch_mask = ((1ULL << dst->ch_len) - 1) << dst->ch_offset;
+	dst->lun_mask = ((1ULL << dst->lun_len) - 1) << dst->lun_offset;
+	dst->pg_mask = ((1ULL << dst->pg_len) - 1) << dst->pg_offset;
+	dst->blk_mask = ((1ULL << dst->blk_len) - 1) << dst->blk_offset;
+
+	return dst->blk_offset + src->blk_len;
+}
+
+static int pblk_set_addrf_20(struct nvm_geo *geo, struct nvm_addrf *adst,
+			     struct pblk_addrf *udst)
+{
+	struct nvm_addrf *src = &geo->addrf;
+
+	adst->ch_len = get_count_order(geo->num_ch);
+	adst->lun_len = get_count_order(geo->num_lun);
+	adst->chk_len = src->chk_len;
+	adst->sec_len = src->sec_len;
+
+	adst->sec_offset = 0;
+	adst->ch_offset = adst->sec_len;
+	adst->lun_offset = adst->ch_offset + adst->ch_len;
+	adst->chk_offset = adst->lun_offset + adst->lun_len;
+
+	adst->sec_mask = ((1ULL << adst->sec_len) - 1) << adst->sec_offset;
+	adst->chk_mask = ((1ULL << adst->chk_len) - 1) << adst->chk_offset;
+	adst->lun_mask = ((1ULL << adst->lun_len) - 1) << adst->lun_offset;
+	adst->ch_mask = ((1ULL << adst->ch_len) - 1) << adst->ch_offset;
+
+	udst->sec_stripe = geo->ws_opt;
+	udst->ch_stripe = geo->num_ch;
+	udst->lun_stripe = geo->num_lun;
+
+	udst->sec_lun_stripe = udst->sec_stripe * udst->ch_stripe;
+	udst->sec_ws_stripe = udst->sec_lun_stripe * udst->lun_stripe;
+
+	return adst->chk_offset + adst->chk_len;
+}
+
+static int pblk_set_addrf(struct pblk *pblk)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	int mod;
+
+	switch (geo->version) {
+	case NVM_OCSSD_SPEC_12:
+		div_u64_rem(geo->clba, pblk->min_write_pgs, &mod);
+		if (mod) {
+			pr_err("pblk: bad configuration of sectors/pages\n");
+			return -EINVAL;
+		}
+
+		pblk->addrf_len = pblk_set_addrf_12(geo, (void *)&pblk->addrf);
+		break;
+	case NVM_OCSSD_SPEC_20:
+		pblk->addrf_len = pblk_set_addrf_20(geo, (void *)&pblk->addrf,
+								&pblk->uaddrf);
+		break;
+	default:
+		pr_err("pblk: OCSSD revision not supported (%d)\n",
+								geo->version);
+		return -EINVAL;
+	}
 
 
 	return 0;
 	return 0;
 }
 }
@@ -252,16 +341,41 @@ static int pblk_core_init(struct pblk *pblk)
 {
 {
 	struct nvm_tgt_dev *dev = pblk->dev;
 	struct nvm_tgt_dev *dev = pblk->dev;
 	struct nvm_geo *geo = &dev->geo;
 	struct nvm_geo *geo = &dev->geo;
+	int max_write_ppas;
 
 
-	pblk->pgs_in_buffer = NVM_MEM_PAGE_WRITE * geo->sec_per_pg *
-						geo->nr_planes * geo->all_luns;
+	atomic64_set(&pblk->user_wa, 0);
+	atomic64_set(&pblk->pad_wa, 0);
+	atomic64_set(&pblk->gc_wa, 0);
+	pblk->user_rst_wa = 0;
+	pblk->pad_rst_wa = 0;
+	pblk->gc_rst_wa = 0;
 
 
-	if (pblk_init_global_caches(pblk))
+	atomic64_set(&pblk->nr_flush, 0);
+	pblk->nr_flush_rst = 0;
+
+	pblk->pgs_in_buffer = geo->mw_cunits * geo->all_luns;
+
+	pblk->min_write_pgs = geo->ws_opt * (geo->csecs / PAGE_SIZE);
+	max_write_ppas = pblk->min_write_pgs * geo->all_luns;
+	pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA);
+	pblk_set_sec_per_write(pblk, pblk->min_write_pgs);
+
+	if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) {
+		pr_err("pblk: vector list too big(%u > %u)\n",
+				pblk->max_write_pgs, PBLK_MAX_REQ_ADDRS);
+		return -EINVAL;
+	}
+
+	pblk->pad_dist = kzalloc((pblk->min_write_pgs - 1) * sizeof(atomic64_t),
+								GFP_KERNEL);
+	if (!pblk->pad_dist)
 		return -ENOMEM;
 		return -ENOMEM;
 
 
+	if (pblk_init_global_caches(pblk))
+		goto fail_free_pad_dist;
+
 	/* Internal bios can be at most the sectors signaled by the device. */
 	/* Internal bios can be at most the sectors signaled by the device. */
-	pblk->page_bio_pool = mempool_create_page_pool(nvm_max_phys_sects(dev),
-									0);
+	pblk->page_bio_pool = mempool_create_page_pool(NVM_MAX_VLBA, 0);
 	if (!pblk->page_bio_pool)
 	if (!pblk->page_bio_pool)
 		goto free_global_caches;
 		goto free_global_caches;
 
 
@@ -305,13 +419,11 @@ static int pblk_core_init(struct pblk *pblk)
 	if (!pblk->r_end_wq)
 	if (!pblk->r_end_wq)
 		goto free_bb_wq;
 		goto free_bb_wq;
 
 
-	if (pblk_set_ppaf(pblk))
-		goto free_r_end_wq;
-
-	if (pblk_rwb_init(pblk))
+	if (pblk_set_addrf(pblk))
 		goto free_r_end_wq;
 		goto free_r_end_wq;
 
 
 	INIT_LIST_HEAD(&pblk->compl_list);
 	INIT_LIST_HEAD(&pblk->compl_list);
+
 	return 0;
 	return 0;
 
 
 free_r_end_wq:
 free_r_end_wq:
@@ -334,6 +446,8 @@ free_page_bio_pool:
 	mempool_destroy(pblk->page_bio_pool);
 	mempool_destroy(pblk->page_bio_pool);
 free_global_caches:
 free_global_caches:
 	pblk_free_global_caches(pblk);
 	pblk_free_global_caches(pblk);
+fail_free_pad_dist:
+	kfree(pblk->pad_dist);
 	return -ENOMEM;
 	return -ENOMEM;
 }
 }
 
 
@@ -355,20 +469,31 @@ static void pblk_core_free(struct pblk *pblk)
 	mempool_destroy(pblk->e_rq_pool);
 	mempool_destroy(pblk->e_rq_pool);
 	mempool_destroy(pblk->w_rq_pool);
 	mempool_destroy(pblk->w_rq_pool);
 
 
-	pblk_rwb_free(pblk);
-
 	pblk_free_global_caches(pblk);
 	pblk_free_global_caches(pblk);
+	kfree(pblk->pad_dist);
 }
 }
 
 
-static void pblk_luns_free(struct pblk *pblk)
+static void pblk_line_mg_free(struct pblk *pblk)
 {
 {
-	kfree(pblk->luns);
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	int i;
+
+	kfree(l_mg->bb_template);
+	kfree(l_mg->bb_aux);
+	kfree(l_mg->vsc_list);
+
+	for (i = 0; i < PBLK_DATA_LINES; i++) {
+		kfree(l_mg->sline_meta[i]);
+		pblk_mfree(l_mg->eline_meta[i]->buf, l_mg->emeta_alloc_type);
+		kfree(l_mg->eline_meta[i]);
+	}
 }
 }
 
 
-static void pblk_free_line_bitmaps(struct pblk_line *line)
+static void pblk_line_meta_free(struct pblk_line *line)
 {
 {
 	kfree(line->blk_bitmap);
 	kfree(line->blk_bitmap);
 	kfree(line->erase_bitmap);
 	kfree(line->erase_bitmap);
+	kfree(line->chks);
 }
 }
 
 
 static void pblk_lines_free(struct pblk *pblk)
 static void pblk_lines_free(struct pblk *pblk)
@@ -382,40 +507,21 @@ static void pblk_lines_free(struct pblk *pblk)
 		line = &pblk->lines[i];
 		line = &pblk->lines[i];
 
 
 		pblk_line_free(pblk, line);
 		pblk_line_free(pblk, line);
-		pblk_free_line_bitmaps(line);
+		pblk_line_meta_free(line);
 	}
 	}
 	spin_unlock(&l_mg->free_lock);
 	spin_unlock(&l_mg->free_lock);
-}
-
-static void pblk_line_meta_free(struct pblk *pblk)
-{
-	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
-	int i;
-
-	kfree(l_mg->bb_template);
-	kfree(l_mg->bb_aux);
-	kfree(l_mg->vsc_list);
 
 
-	for (i = 0; i < PBLK_DATA_LINES; i++) {
-		kfree(l_mg->sline_meta[i]);
-		pblk_mfree(l_mg->eline_meta[i]->buf, l_mg->emeta_alloc_type);
-		kfree(l_mg->eline_meta[i]);
-	}
+	pblk_line_mg_free(pblk);
 
 
+	kfree(pblk->luns);
 	kfree(pblk->lines);
 	kfree(pblk->lines);
 }
 }
 
 
-static int pblk_bb_discovery(struct nvm_tgt_dev *dev, struct pblk_lun *rlun)
+static int pblk_bb_get_tbl(struct nvm_tgt_dev *dev, struct pblk_lun *rlun,
+			   u8 *blks, int nr_blks)
 {
 {
-	struct nvm_geo *geo = &dev->geo;
 	struct ppa_addr ppa;
 	struct ppa_addr ppa;
-	u8 *blks;
-	int nr_blks, ret;
-
-	nr_blks = geo->nr_chks * geo->plane_mode;
-	blks = kmalloc(nr_blks, GFP_KERNEL);
-	if (!blks)
-		return -ENOMEM;
+	int ret;
 
 
 	ppa.ppa = 0;
 	ppa.ppa = 0;
 	ppa.g.ch = rlun->bppa.g.ch;
 	ppa.g.ch = rlun->bppa.g.ch;
@@ -423,69 +529,64 @@ static int pblk_bb_discovery(struct nvm_tgt_dev *dev, struct pblk_lun *rlun)
 
 
 	ret = nvm_get_tgt_bb_tbl(dev, ppa, blks);
 	ret = nvm_get_tgt_bb_tbl(dev, ppa, blks);
 	if (ret)
 	if (ret)
-		goto out;
+		return ret;
 
 
 	nr_blks = nvm_bb_tbl_fold(dev->parent, blks, nr_blks);
 	nr_blks = nvm_bb_tbl_fold(dev->parent, blks, nr_blks);
-	if (nr_blks < 0) {
-		ret = nr_blks;
-		goto out;
-	}
-
-	rlun->bb_list = blks;
+	if (nr_blks < 0)
+		return -EIO;
 
 
 	return 0;
 	return 0;
-out:
-	kfree(blks);
-	return ret;
 }
 }
 
 
-static int pblk_bb_line(struct pblk *pblk, struct pblk_line *line,
-			int blk_per_line)
+static void *pblk_bb_get_meta(struct pblk *pblk)
 {
 {
 	struct nvm_tgt_dev *dev = pblk->dev;
 	struct nvm_tgt_dev *dev = pblk->dev;
 	struct nvm_geo *geo = &dev->geo;
 	struct nvm_geo *geo = &dev->geo;
-	struct pblk_lun *rlun;
-	int bb_cnt = 0;
-	int i;
+	u8 *meta;
+	int i, nr_blks, blk_per_lun;
+	int ret;
 
 
-	for (i = 0; i < blk_per_line; i++) {
-		rlun = &pblk->luns[i];
-		if (rlun->bb_list[line->id] == NVM_BLK_T_FREE)
-			continue;
+	blk_per_lun = geo->num_chk * geo->pln_mode;
+	nr_blks = blk_per_lun * geo->all_luns;
+
+	meta = kmalloc(nr_blks, GFP_KERNEL);
+	if (!meta)
+		return ERR_PTR(-ENOMEM);
+
+	for (i = 0; i < geo->all_luns; i++) {
+		struct pblk_lun *rlun = &pblk->luns[i];
+		u8 *meta_pos = meta + i * blk_per_lun;
 
 
-		set_bit(pblk_ppa_to_pos(geo, rlun->bppa), line->blk_bitmap);
-		bb_cnt++;
+		ret = pblk_bb_get_tbl(dev, rlun, meta_pos, blk_per_lun);
+		if (ret) {
+			kfree(meta);
+			return ERR_PTR(-EIO);
+		}
 	}
 	}
 
 
-	return bb_cnt;
+	return meta;
 }
 }
 
 
-static int pblk_alloc_line_bitmaps(struct pblk *pblk, struct pblk_line *line)
+static void *pblk_chunk_get_meta(struct pblk *pblk)
 {
 {
-	struct pblk_line_meta *lm = &pblk->lm;
-
-	line->blk_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL);
-	if (!line->blk_bitmap)
-		return -ENOMEM;
-
-	line->erase_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL);
-	if (!line->erase_bitmap) {
-		kfree(line->blk_bitmap);
-		return -ENOMEM;
-	}
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
 
 
-	return 0;
+	if (geo->version == NVM_OCSSD_SPEC_12)
+		return pblk_bb_get_meta(pblk);
+	else
+		return pblk_chunk_get_info(pblk);
 }
 }
 
 
-static int pblk_luns_init(struct pblk *pblk, struct ppa_addr *luns)
+static int pblk_luns_init(struct pblk *pblk)
 {
 {
 	struct nvm_tgt_dev *dev = pblk->dev;
 	struct nvm_tgt_dev *dev = pblk->dev;
 	struct nvm_geo *geo = &dev->geo;
 	struct nvm_geo *geo = &dev->geo;
 	struct pblk_lun *rlun;
 	struct pblk_lun *rlun;
-	int i, ret;
+	int i;
 
 
 	/* TODO: Implement unbalanced LUN support */
 	/* TODO: Implement unbalanced LUN support */
-	if (geo->nr_luns < 0) {
+	if (geo->num_lun < 0) {
 		pr_err("pblk: unbalanced LUN config.\n");
 		pr_err("pblk: unbalanced LUN config.\n");
 		return -EINVAL;
 		return -EINVAL;
 	}
 	}
@@ -497,58 +598,19 @@ static int pblk_luns_init(struct pblk *pblk, struct ppa_addr *luns)
 
 
 	for (i = 0; i < geo->all_luns; i++) {
 	for (i = 0; i < geo->all_luns; i++) {
 		/* Stripe across channels */
 		/* Stripe across channels */
-		int ch = i % geo->nr_chnls;
-		int lun_raw = i / geo->nr_chnls;
-		int lunid = lun_raw + ch * geo->nr_luns;
+		int ch = i % geo->num_ch;
+		int lun_raw = i / geo->num_ch;
+		int lunid = lun_raw + ch * geo->num_lun;
 
 
 		rlun = &pblk->luns[i];
 		rlun = &pblk->luns[i];
-		rlun->bppa = luns[lunid];
+		rlun->bppa = dev->luns[lunid];
 
 
 		sema_init(&rlun->wr_sem, 1);
 		sema_init(&rlun->wr_sem, 1);
-
-		ret = pblk_bb_discovery(dev, rlun);
-		if (ret) {
-			while (--i >= 0)
-				kfree(pblk->luns[i].bb_list);
-			return ret;
-		}
 	}
 	}
 
 
 	return 0;
 	return 0;
 }
 }
 
 
-static int pblk_lines_configure(struct pblk *pblk, int flags)
-{
-	struct pblk_line *line = NULL;
-	int ret = 0;
-
-	if (!(flags & NVM_TARGET_FACTORY)) {
-		line = pblk_recov_l2p(pblk);
-		if (IS_ERR(line)) {
-			pr_err("pblk: could not recover l2p table\n");
-			ret = -EFAULT;
-		}
-	}
-
-#ifdef CONFIG_NVM_DEBUG
-	pr_info("pblk init: L2P CRC: %x\n", pblk_l2p_crc(pblk));
-#endif
-
-	/* Free full lines directly as GC has not been started yet */
-	pblk_gc_free_full_lines(pblk);
-
-	if (!line) {
-		/* Configure next line for user data */
-		line = pblk_line_get_first_data(pblk);
-		if (!line) {
-			pr_err("pblk: line list corrupted\n");
-			ret = -EFAULT;
-		}
-	}
-
-	return ret;
-}
-
 /* See comment over struct line_emeta definition */
 /* See comment over struct line_emeta definition */
 static unsigned int calc_emeta_len(struct pblk *pblk)
 static unsigned int calc_emeta_len(struct pblk *pblk)
 {
 {
@@ -559,19 +621,19 @@ static unsigned int calc_emeta_len(struct pblk *pblk)
 
 
 	/* Round to sector size so that lba_list starts on its own sector */
 	/* Round to sector size so that lba_list starts on its own sector */
 	lm->emeta_sec[1] = DIV_ROUND_UP(
 	lm->emeta_sec[1] = DIV_ROUND_UP(
-			sizeof(struct line_emeta) + lm->blk_bitmap_len,
-			geo->sec_size);
-	lm->emeta_len[1] = lm->emeta_sec[1] * geo->sec_size;
+			sizeof(struct line_emeta) + lm->blk_bitmap_len +
+			sizeof(struct wa_counters), geo->csecs);
+	lm->emeta_len[1] = lm->emeta_sec[1] * geo->csecs;
 
 
 	/* Round to sector size so that vsc_list starts on its own sector */
 	/* Round to sector size so that vsc_list starts on its own sector */
 	lm->dsec_per_line = lm->sec_per_line - lm->emeta_sec[0];
 	lm->dsec_per_line = lm->sec_per_line - lm->emeta_sec[0];
 	lm->emeta_sec[2] = DIV_ROUND_UP(lm->dsec_per_line * sizeof(u64),
 	lm->emeta_sec[2] = DIV_ROUND_UP(lm->dsec_per_line * sizeof(u64),
-			geo->sec_size);
-	lm->emeta_len[2] = lm->emeta_sec[2] * geo->sec_size;
+			geo->csecs);
+	lm->emeta_len[2] = lm->emeta_sec[2] * geo->csecs;
 
 
 	lm->emeta_sec[3] = DIV_ROUND_UP(l_mg->nr_lines * sizeof(u32),
 	lm->emeta_sec[3] = DIV_ROUND_UP(l_mg->nr_lines * sizeof(u32),
-			geo->sec_size);
-	lm->emeta_len[3] = lm->emeta_sec[3] * geo->sec_size;
+			geo->csecs);
+	lm->emeta_len[3] = lm->emeta_sec[3] * geo->csecs;
 
 
 	lm->vsc_list_len = l_mg->nr_lines * sizeof(u32);
 	lm->vsc_list_len = l_mg->nr_lines * sizeof(u32);
 
 
@@ -602,23 +664,211 @@ static void pblk_set_provision(struct pblk *pblk, long nr_free_blks)
 	 * on user capacity consider only provisioned blocks
 	 * on user capacity consider only provisioned blocks
 	 */
 	 */
 	pblk->rl.total_blocks = nr_free_blks;
 	pblk->rl.total_blocks = nr_free_blks;
-	pblk->rl.nr_secs = nr_free_blks * geo->sec_per_chk;
+	pblk->rl.nr_secs = nr_free_blks * geo->clba;
 
 
 	/* Consider sectors used for metadata */
 	/* Consider sectors used for metadata */
 	sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
 	sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
-	blk_meta = DIV_ROUND_UP(sec_meta, geo->sec_per_chk);
+	blk_meta = DIV_ROUND_UP(sec_meta, geo->clba);
 
 
-	pblk->capacity = (provisioned - blk_meta) * geo->sec_per_chk;
+	pblk->capacity = (provisioned - blk_meta) * geo->clba;
 
 
 	atomic_set(&pblk->rl.free_blocks, nr_free_blks);
 	atomic_set(&pblk->rl.free_blocks, nr_free_blks);
 	atomic_set(&pblk->rl.free_user_blocks, nr_free_blks);
 	atomic_set(&pblk->rl.free_user_blocks, nr_free_blks);
 }
 }
 
 
-static int pblk_lines_alloc_metadata(struct pblk *pblk)
+static int pblk_setup_line_meta_12(struct pblk *pblk, struct pblk_line *line,
+				   void *chunk_meta)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct pblk_line_meta *lm = &pblk->lm;
+	int i, chk_per_lun, nr_bad_chks = 0;
+
+	chk_per_lun = geo->num_chk * geo->pln_mode;
+
+	for (i = 0; i < lm->blk_per_line; i++) {
+		struct pblk_lun *rlun = &pblk->luns[i];
+		struct nvm_chk_meta *chunk;
+		int pos = pblk_ppa_to_pos(geo, rlun->bppa);
+		u8 *lun_bb_meta = chunk_meta + pos * chk_per_lun;
+
+		chunk = &line->chks[pos];
+
+		/*
+		 * In 1.2 spec. chunk state is not persisted by the device. Thus
+		 * some of the values are reset each time pblk is instantiated.
+		 */
+		if (lun_bb_meta[line->id] == NVM_BLK_T_FREE)
+			chunk->state =  NVM_CHK_ST_FREE;
+		else
+			chunk->state = NVM_CHK_ST_OFFLINE;
+
+		chunk->type = NVM_CHK_TP_W_SEQ;
+		chunk->wi = 0;
+		chunk->slba = -1;
+		chunk->cnlb = geo->clba;
+		chunk->wp = 0;
+
+		if (!(chunk->state & NVM_CHK_ST_OFFLINE))
+			continue;
+
+		set_bit(pos, line->blk_bitmap);
+		nr_bad_chks++;
+	}
+
+	return nr_bad_chks;
+}
+
+static int pblk_setup_line_meta_20(struct pblk *pblk, struct pblk_line *line,
+				   struct nvm_chk_meta *meta)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct pblk_line_meta *lm = &pblk->lm;
+	int i, nr_bad_chks = 0;
+
+	for (i = 0; i < lm->blk_per_line; i++) {
+		struct pblk_lun *rlun = &pblk->luns[i];
+		struct nvm_chk_meta *chunk;
+		struct nvm_chk_meta *chunk_meta;
+		struct ppa_addr ppa;
+		int pos;
+
+		ppa = rlun->bppa;
+		pos = pblk_ppa_to_pos(geo, ppa);
+		chunk = &line->chks[pos];
+
+		ppa.m.chk = line->id;
+		chunk_meta = pblk_chunk_get_off(pblk, meta, ppa);
+
+		chunk->state = chunk_meta->state;
+		chunk->type = chunk_meta->type;
+		chunk->wi = chunk_meta->wi;
+		chunk->slba = chunk_meta->slba;
+		chunk->cnlb = chunk_meta->cnlb;
+		chunk->wp = chunk_meta->wp;
+
+		if (!(chunk->state & NVM_CHK_ST_OFFLINE))
+			continue;
+
+		if (chunk->type & NVM_CHK_TP_SZ_SPEC) {
+			WARN_ONCE(1, "pblk: custom-sized chunks unsupported\n");
+			continue;
+		}
+
+		set_bit(pos, line->blk_bitmap);
+		nr_bad_chks++;
+	}
+
+	return nr_bad_chks;
+}
+
+static long pblk_setup_line_meta(struct pblk *pblk, struct pblk_line *line,
+				 void *chunk_meta, int line_id)
 {
 {
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
 	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
 	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
 	struct pblk_line_meta *lm = &pblk->lm;
 	struct pblk_line_meta *lm = &pblk->lm;
-	int i;
+	long nr_bad_chks, chk_in_line;
+
+	line->pblk = pblk;
+	line->id = line_id;
+	line->type = PBLK_LINETYPE_FREE;
+	line->state = PBLK_LINESTATE_NEW;
+	line->gc_group = PBLK_LINEGC_NONE;
+	line->vsc = &l_mg->vsc_list[line_id];
+	spin_lock_init(&line->lock);
+
+	if (geo->version == NVM_OCSSD_SPEC_12)
+		nr_bad_chks = pblk_setup_line_meta_12(pblk, line, chunk_meta);
+	else
+		nr_bad_chks = pblk_setup_line_meta_20(pblk, line, chunk_meta);
+
+	chk_in_line = lm->blk_per_line - nr_bad_chks;
+	if (nr_bad_chks < 0 || nr_bad_chks > lm->blk_per_line ||
+					chk_in_line < lm->min_blk_line) {
+		line->state = PBLK_LINESTATE_BAD;
+		list_add_tail(&line->list, &l_mg->bad_list);
+		return 0;
+	}
+
+	atomic_set(&line->blk_in_line, chk_in_line);
+	list_add_tail(&line->list, &l_mg->free_list);
+	l_mg->nr_free_lines++;
+
+	return chk_in_line;
+}
+
+static int pblk_alloc_line_meta(struct pblk *pblk, struct pblk_line *line)
+{
+	struct pblk_line_meta *lm = &pblk->lm;
+
+	line->blk_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL);
+	if (!line->blk_bitmap)
+		return -ENOMEM;
+
+	line->erase_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL);
+	if (!line->erase_bitmap) {
+		kfree(line->blk_bitmap);
+		return -ENOMEM;
+	}
+
+	line->chks = kmalloc(lm->blk_per_line * sizeof(struct nvm_chk_meta),
+								GFP_KERNEL);
+	if (!line->chks) {
+		kfree(line->erase_bitmap);
+		kfree(line->blk_bitmap);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int pblk_line_mg_init(struct pblk *pblk)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	struct pblk_line_meta *lm = &pblk->lm;
+	int i, bb_distance;
+
+	l_mg->nr_lines = geo->num_chk;
+	l_mg->log_line = l_mg->data_line = NULL;
+	l_mg->l_seq_nr = l_mg->d_seq_nr = 0;
+	l_mg->nr_free_lines = 0;
+	bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES);
+
+	INIT_LIST_HEAD(&l_mg->free_list);
+	INIT_LIST_HEAD(&l_mg->corrupt_list);
+	INIT_LIST_HEAD(&l_mg->bad_list);
+	INIT_LIST_HEAD(&l_mg->gc_full_list);
+	INIT_LIST_HEAD(&l_mg->gc_high_list);
+	INIT_LIST_HEAD(&l_mg->gc_mid_list);
+	INIT_LIST_HEAD(&l_mg->gc_low_list);
+	INIT_LIST_HEAD(&l_mg->gc_empty_list);
+
+	INIT_LIST_HEAD(&l_mg->emeta_list);
+
+	l_mg->gc_lists[0] = &l_mg->gc_high_list;
+	l_mg->gc_lists[1] = &l_mg->gc_mid_list;
+	l_mg->gc_lists[2] = &l_mg->gc_low_list;
+
+	spin_lock_init(&l_mg->free_lock);
+	spin_lock_init(&l_mg->close_lock);
+	spin_lock_init(&l_mg->gc_lock);
+
+	l_mg->vsc_list = kcalloc(l_mg->nr_lines, sizeof(__le32), GFP_KERNEL);
+	if (!l_mg->vsc_list)
+		goto fail;
+
+	l_mg->bb_template = kzalloc(lm->sec_bitmap_len, GFP_KERNEL);
+	if (!l_mg->bb_template)
+		goto fail_free_vsc_list;
+
+	l_mg->bb_aux = kzalloc(lm->sec_bitmap_len, GFP_KERNEL);
+	if (!l_mg->bb_aux)
+		goto fail_free_bb_template;
 
 
 	/* smeta is always small enough to fit on a kmalloc memory allocation,
 	/* smeta is always small enough to fit on a kmalloc memory allocation,
 	 * emeta depends on the number of LUNs allocated to the pblk instance
 	 * emeta depends on the number of LUNs allocated to the pblk instance
@@ -664,13 +914,13 @@ static int pblk_lines_alloc_metadata(struct pblk *pblk)
 		}
 		}
 	}
 	}
 
 
-	l_mg->vsc_list = kcalloc(l_mg->nr_lines, sizeof(__le32), GFP_KERNEL);
-	if (!l_mg->vsc_list)
-		goto fail_free_emeta;
-
 	for (i = 0; i < l_mg->nr_lines; i++)
 	for (i = 0; i < l_mg->nr_lines; i++)
 		l_mg->vsc_list[i] = cpu_to_le32(EMPTY_ENTRY);
 		l_mg->vsc_list[i] = cpu_to_le32(EMPTY_ENTRY);
 
 
+	bb_distance = (geo->all_luns) * geo->ws_opt;
+	for (i = 0; i < lm->sec_per_line; i += bb_distance)
+		bitmap_set(l_mg->bb_template, i, geo->ws_opt);
+
 	return 0;
 	return 0;
 
 
 fail_free_emeta:
 fail_free_emeta:
@@ -681,50 +931,27 @@ fail_free_emeta:
 			kfree(l_mg->eline_meta[i]->buf);
 			kfree(l_mg->eline_meta[i]->buf);
 		kfree(l_mg->eline_meta[i]);
 		kfree(l_mg->eline_meta[i]);
 	}
 	}
-
 fail_free_smeta:
 fail_free_smeta:
 	for (i = 0; i < PBLK_DATA_LINES; i++)
 	for (i = 0; i < PBLK_DATA_LINES; i++)
 		kfree(l_mg->sline_meta[i]);
 		kfree(l_mg->sline_meta[i]);
-
+	kfree(l_mg->bb_aux);
+fail_free_bb_template:
+	kfree(l_mg->bb_template);
+fail_free_vsc_list:
+	kfree(l_mg->vsc_list);
+fail:
 	return -ENOMEM;
 	return -ENOMEM;
 }
 }
 
 
-static int pblk_lines_init(struct pblk *pblk)
+static int pblk_line_meta_init(struct pblk *pblk)
 {
 {
 	struct nvm_tgt_dev *dev = pblk->dev;
 	struct nvm_tgt_dev *dev = pblk->dev;
 	struct nvm_geo *geo = &dev->geo;
 	struct nvm_geo *geo = &dev->geo;
-	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
 	struct pblk_line_meta *lm = &pblk->lm;
 	struct pblk_line_meta *lm = &pblk->lm;
-	struct pblk_line *line;
 	unsigned int smeta_len, emeta_len;
 	unsigned int smeta_len, emeta_len;
-	long nr_bad_blks, nr_free_blks;
-	int bb_distance, max_write_ppas, mod;
-	int i, ret;
-
-	pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / PAGE_SIZE);
-	max_write_ppas = pblk->min_write_pgs * geo->all_luns;
-	pblk->max_write_pgs = (max_write_ppas < nvm_max_phys_sects(dev)) ?
-				max_write_ppas : nvm_max_phys_sects(dev);
-	pblk_set_sec_per_write(pblk, pblk->min_write_pgs);
-
-	if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) {
-		pr_err("pblk: cannot support device max_phys_sect\n");
-		return -EINVAL;
-	}
-
-	div_u64_rem(geo->sec_per_chk, pblk->min_write_pgs, &mod);
-	if (mod) {
-		pr_err("pblk: bad configuration of sectors/pages\n");
-		return -EINVAL;
-	}
-
-	l_mg->nr_lines = geo->nr_chks;
-	l_mg->log_line = l_mg->data_line = NULL;
-	l_mg->l_seq_nr = l_mg->d_seq_nr = 0;
-	l_mg->nr_free_lines = 0;
-	bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES);
+	int i;
 
 
-	lm->sec_per_line = geo->sec_per_chk * geo->all_luns;
+	lm->sec_per_line = geo->clba * geo->all_luns;
 	lm->blk_per_line = geo->all_luns;
 	lm->blk_per_line = geo->all_luns;
 	lm->blk_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long);
 	lm->blk_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long);
 	lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long);
 	lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long);
@@ -738,8 +965,8 @@ static int pblk_lines_init(struct pblk *pblk)
 	 */
 	 */
 	i = 1;
 	i = 1;
 add_smeta_page:
 add_smeta_page:
-	lm->smeta_sec = i * geo->sec_per_pl;
-	lm->smeta_len = lm->smeta_sec * geo->sec_size;
+	lm->smeta_sec = i * geo->ws_opt;
+	lm->smeta_len = lm->smeta_sec * geo->csecs;
 
 
 	smeta_len = sizeof(struct line_smeta) + lm->lun_bitmap_len;
 	smeta_len = sizeof(struct line_smeta) + lm->lun_bitmap_len;
 	if (smeta_len > lm->smeta_len) {
 	if (smeta_len > lm->smeta_len) {
@@ -752,8 +979,8 @@ add_smeta_page:
 	 */
 	 */
 	i = 1;
 	i = 1;
 add_emeta_page:
 add_emeta_page:
-	lm->emeta_sec[0] = i * geo->sec_per_pl;
-	lm->emeta_len[0] = lm->emeta_sec[0] * geo->sec_size;
+	lm->emeta_sec[0] = i * geo->ws_opt;
+	lm->emeta_len[0] = lm->emeta_sec[0] * geo->csecs;
 
 
 	emeta_len = calc_emeta_len(pblk);
 	emeta_len = calc_emeta_len(pblk);
 	if (emeta_len > lm->emeta_len[0]) {
 	if (emeta_len > lm->emeta_len[0]) {
@@ -766,119 +993,75 @@ add_emeta_page:
 	lm->min_blk_line = 1;
 	lm->min_blk_line = 1;
 	if (geo->all_luns > 1)
 	if (geo->all_luns > 1)
 		lm->min_blk_line += DIV_ROUND_UP(lm->smeta_sec +
 		lm->min_blk_line += DIV_ROUND_UP(lm->smeta_sec +
-					lm->emeta_sec[0], geo->sec_per_chk);
+					lm->emeta_sec[0], geo->clba);
 
 
 	if (lm->min_blk_line > lm->blk_per_line) {
 	if (lm->min_blk_line > lm->blk_per_line) {
 		pr_err("pblk: config. not supported. Min. LUN in line:%d\n",
 		pr_err("pblk: config. not supported. Min. LUN in line:%d\n",
 							lm->blk_per_line);
 							lm->blk_per_line);
-		ret = -EINVAL;
-		goto fail;
-	}
-
-	ret = pblk_lines_alloc_metadata(pblk);
-	if (ret)
-		goto fail;
-
-	l_mg->bb_template = kzalloc(lm->sec_bitmap_len, GFP_KERNEL);
-	if (!l_mg->bb_template) {
-		ret = -ENOMEM;
-		goto fail_free_meta;
+		return -EINVAL;
 	}
 	}
 
 
-	l_mg->bb_aux = kzalloc(lm->sec_bitmap_len, GFP_KERNEL);
-	if (!l_mg->bb_aux) {
-		ret = -ENOMEM;
-		goto fail_free_bb_template;
-	}
+	return 0;
+}
 
 
-	bb_distance = (geo->all_luns) * geo->sec_per_pl;
-	for (i = 0; i < lm->sec_per_line; i += bb_distance)
-		bitmap_set(l_mg->bb_template, i, geo->sec_per_pl);
+static int pblk_lines_init(struct pblk *pblk)
+{
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	struct pblk_line *line;
+	void *chunk_meta;
+	long nr_free_chks = 0;
+	int i, ret;
 
 
-	INIT_LIST_HEAD(&l_mg->free_list);
-	INIT_LIST_HEAD(&l_mg->corrupt_list);
-	INIT_LIST_HEAD(&l_mg->bad_list);
-	INIT_LIST_HEAD(&l_mg->gc_full_list);
-	INIT_LIST_HEAD(&l_mg->gc_high_list);
-	INIT_LIST_HEAD(&l_mg->gc_mid_list);
-	INIT_LIST_HEAD(&l_mg->gc_low_list);
-	INIT_LIST_HEAD(&l_mg->gc_empty_list);
+	ret = pblk_line_meta_init(pblk);
+	if (ret)
+		return ret;
 
 
-	INIT_LIST_HEAD(&l_mg->emeta_list);
+	ret = pblk_line_mg_init(pblk);
+	if (ret)
+		return ret;
 
 
-	l_mg->gc_lists[0] = &l_mg->gc_high_list;
-	l_mg->gc_lists[1] = &l_mg->gc_mid_list;
-	l_mg->gc_lists[2] = &l_mg->gc_low_list;
+	ret = pblk_luns_init(pblk);
+	if (ret)
+		goto fail_free_meta;
 
 
-	spin_lock_init(&l_mg->free_lock);
-	spin_lock_init(&l_mg->close_lock);
-	spin_lock_init(&l_mg->gc_lock);
+	chunk_meta = pblk_chunk_get_meta(pblk);
+	if (IS_ERR(chunk_meta)) {
+		ret = PTR_ERR(chunk_meta);
+		goto fail_free_luns;
+	}
 
 
 	pblk->lines = kcalloc(l_mg->nr_lines, sizeof(struct pblk_line),
 	pblk->lines = kcalloc(l_mg->nr_lines, sizeof(struct pblk_line),
 								GFP_KERNEL);
 								GFP_KERNEL);
 	if (!pblk->lines) {
 	if (!pblk->lines) {
 		ret = -ENOMEM;
 		ret = -ENOMEM;
-		goto fail_free_bb_aux;
+		goto fail_free_chunk_meta;
 	}
 	}
 
 
-	nr_free_blks = 0;
 	for (i = 0; i < l_mg->nr_lines; i++) {
 	for (i = 0; i < l_mg->nr_lines; i++) {
-		int blk_in_line;
-
 		line = &pblk->lines[i];
 		line = &pblk->lines[i];
 
 
-		line->pblk = pblk;
-		line->id = i;
-		line->type = PBLK_LINETYPE_FREE;
-		line->state = PBLK_LINESTATE_FREE;
-		line->gc_group = PBLK_LINEGC_NONE;
-		line->vsc = &l_mg->vsc_list[i];
-		spin_lock_init(&line->lock);
-
-		ret = pblk_alloc_line_bitmaps(pblk, line);
+		ret = pblk_alloc_line_meta(pblk, line);
 		if (ret)
 		if (ret)
 			goto fail_free_lines;
 			goto fail_free_lines;
 
 
-		nr_bad_blks = pblk_bb_line(pblk, line, lm->blk_per_line);
-		if (nr_bad_blks < 0 || nr_bad_blks > lm->blk_per_line) {
-			pblk_free_line_bitmaps(line);
-			ret = -EINVAL;
-			goto fail_free_lines;
-		}
-
-		blk_in_line = lm->blk_per_line - nr_bad_blks;
-		if (blk_in_line < lm->min_blk_line) {
-			line->state = PBLK_LINESTATE_BAD;
-			list_add_tail(&line->list, &l_mg->bad_list);
-			continue;
-		}
-
-		nr_free_blks += blk_in_line;
-		atomic_set(&line->blk_in_line, blk_in_line);
-
-		l_mg->nr_free_lines++;
-		list_add_tail(&line->list, &l_mg->free_list);
+		nr_free_chks += pblk_setup_line_meta(pblk, line, chunk_meta, i);
 	}
 	}
 
 
-	pblk_set_provision(pblk, nr_free_blks);
-
-	/* Cleanup per-LUN bad block lists - managed within lines on run-time */
-	for (i = 0; i < geo->all_luns; i++)
-		kfree(pblk->luns[i].bb_list);
+	pblk_set_provision(pblk, nr_free_chks);
 
 
+	kfree(chunk_meta);
 	return 0;
 	return 0;
+
 fail_free_lines:
 fail_free_lines:
 	while (--i >= 0)
 	while (--i >= 0)
-		pblk_free_line_bitmaps(&pblk->lines[i]);
-fail_free_bb_aux:
-	kfree(l_mg->bb_aux);
-fail_free_bb_template:
-	kfree(l_mg->bb_template);
+		pblk_line_meta_free(&pblk->lines[i]);
+	kfree(pblk->lines);
+fail_free_chunk_meta:
+	kfree(chunk_meta);
+fail_free_luns:
+	kfree(pblk->luns);
 fail_free_meta:
 fail_free_meta:
-	pblk_line_meta_free(pblk);
-fail:
-	for (i = 0; i < geo->all_luns; i++)
-		kfree(pblk->luns[i].bb_list);
+	pblk_line_mg_free(pblk);
 
 
 	return ret;
 	return ret;
 }
 }
@@ -912,18 +1095,17 @@ static void pblk_writer_stop(struct pblk *pblk)
 	WARN(pblk_rb_sync_count(&pblk->rwb),
 	WARN(pblk_rb_sync_count(&pblk->rwb),
 			"Stopping not fully synced write buffer\n");
 			"Stopping not fully synced write buffer\n");
 
 
+	del_timer_sync(&pblk->wtimer);
 	if (pblk->writer_ts)
 	if (pblk->writer_ts)
 		kthread_stop(pblk->writer_ts);
 		kthread_stop(pblk->writer_ts);
-	del_timer(&pblk->wtimer);
 }
 }
 
 
 static void pblk_free(struct pblk *pblk)
 static void pblk_free(struct pblk *pblk)
 {
 {
-	pblk_luns_free(pblk);
 	pblk_lines_free(pblk);
 	pblk_lines_free(pblk);
-	pblk_line_meta_free(pblk);
-	pblk_core_free(pblk);
 	pblk_l2p_free(pblk);
 	pblk_l2p_free(pblk);
+	pblk_rwb_free(pblk);
+	pblk_core_free(pblk);
 
 
 	kfree(pblk);
 	kfree(pblk);
 }
 }
@@ -970,9 +1152,17 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
 	struct pblk *pblk;
 	struct pblk *pblk;
 	int ret;
 	int ret;
 
 
-	if (dev->identity.dom & NVM_RSP_L2P) {
+	/* pblk supports 1.2 and 2.0 versions */
+	if (!(geo->version == NVM_OCSSD_SPEC_12 ||
+					geo->version == NVM_OCSSD_SPEC_20)) {
+		pr_err("pblk: OCSSD version not supported (%u)\n",
+							geo->version);
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (geo->version == NVM_OCSSD_SPEC_12 && geo->dom & NVM_RSP_L2P) {
 		pr_err("pblk: host-side L2P table not supported. (%x)\n",
 		pr_err("pblk: host-side L2P table not supported. (%x)\n",
-							dev->identity.dom);
+							geo->dom);
 		return ERR_PTR(-EINVAL);
 		return ERR_PTR(-EINVAL);
 	}
 	}
 
 
@@ -988,14 +1178,10 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
 	spin_lock_init(&pblk->trans_lock);
 	spin_lock_init(&pblk->trans_lock);
 	spin_lock_init(&pblk->lock);
 	spin_lock_init(&pblk->lock);
 
 
-	if (flags & NVM_TARGET_FACTORY)
-		pblk_setup_uuid(pblk);
-
 #ifdef CONFIG_NVM_DEBUG
 #ifdef CONFIG_NVM_DEBUG
 	atomic_long_set(&pblk->inflight_writes, 0);
 	atomic_long_set(&pblk->inflight_writes, 0);
 	atomic_long_set(&pblk->padded_writes, 0);
 	atomic_long_set(&pblk->padded_writes, 0);
 	atomic_long_set(&pblk->padded_wb, 0);
 	atomic_long_set(&pblk->padded_wb, 0);
-	atomic_long_set(&pblk->nr_flush, 0);
 	atomic_long_set(&pblk->req_writes, 0);
 	atomic_long_set(&pblk->req_writes, 0);
 	atomic_long_set(&pblk->sub_writes, 0);
 	atomic_long_set(&pblk->sub_writes, 0);
 	atomic_long_set(&pblk->sync_writes, 0);
 	atomic_long_set(&pblk->sync_writes, 0);
@@ -1015,41 +1201,35 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
 	atomic_long_set(&pblk->write_failed, 0);
 	atomic_long_set(&pblk->write_failed, 0);
 	atomic_long_set(&pblk->erase_failed, 0);
 	atomic_long_set(&pblk->erase_failed, 0);
 
 
-	ret = pblk_luns_init(pblk, dev->luns);
+	ret = pblk_core_init(pblk);
 	if (ret) {
 	if (ret) {
-		pr_err("pblk: could not initialize luns\n");
+		pr_err("pblk: could not initialize core\n");
 		goto fail;
 		goto fail;
 	}
 	}
 
 
 	ret = pblk_lines_init(pblk);
 	ret = pblk_lines_init(pblk);
 	if (ret) {
 	if (ret) {
 		pr_err("pblk: could not initialize lines\n");
 		pr_err("pblk: could not initialize lines\n");
-		goto fail_free_luns;
+		goto fail_free_core;
 	}
 	}
 
 
-	ret = pblk_core_init(pblk);
+	ret = pblk_rwb_init(pblk);
 	if (ret) {
 	if (ret) {
-		pr_err("pblk: could not initialize core\n");
-		goto fail_free_line_meta;
+		pr_err("pblk: could not initialize write buffer\n");
+		goto fail_free_lines;
 	}
 	}
 
 
-	ret = pblk_l2p_init(pblk);
+	ret = pblk_l2p_init(pblk, flags & NVM_TARGET_FACTORY);
 	if (ret) {
 	if (ret) {
 		pr_err("pblk: could not initialize maps\n");
 		pr_err("pblk: could not initialize maps\n");
-		goto fail_free_core;
-	}
-
-	ret = pblk_lines_configure(pblk, flags);
-	if (ret) {
-		pr_err("pblk: could not configure lines\n");
-		goto fail_free_l2p;
+		goto fail_free_rwb;
 	}
 	}
 
 
 	ret = pblk_writer_init(pblk);
 	ret = pblk_writer_init(pblk);
 	if (ret) {
 	if (ret) {
 		if (ret != -EINTR)
 		if (ret != -EINTR)
 			pr_err("pblk: could not initialize write thread\n");
 			pr_err("pblk: could not initialize write thread\n");
-		goto fail_free_lines;
+		goto fail_free_l2p;
 	}
 	}
 
 
 	ret = pblk_gc_init(pblk);
 	ret = pblk_gc_init(pblk);
@@ -1064,10 +1244,10 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
 
 
 	blk_queue_write_cache(tqueue, true, false);
 	blk_queue_write_cache(tqueue, true, false);
 
 
-	tqueue->limits.discard_granularity = geo->sec_per_chk * geo->sec_size;
+	tqueue->limits.discard_granularity = geo->clba * geo->csecs;
 	tqueue->limits.discard_alignment = 0;
 	tqueue->limits.discard_alignment = 0;
 	blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9);
 	blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9);
-	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, tqueue);
+	blk_queue_flag_set(QUEUE_FLAG_DISCARD, tqueue);
 
 
 	pr_info("pblk(%s): luns:%u, lines:%d, secs:%llu, buf entries:%u\n",
 	pr_info("pblk(%s): luns:%u, lines:%d, secs:%llu, buf entries:%u\n",
 			tdisk->disk_name,
 			tdisk->disk_name,
@@ -1084,16 +1264,14 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
 
 
 fail_stop_writer:
 fail_stop_writer:
 	pblk_writer_stop(pblk);
 	pblk_writer_stop(pblk);
-fail_free_lines:
-	pblk_lines_free(pblk);
 fail_free_l2p:
 fail_free_l2p:
 	pblk_l2p_free(pblk);
 	pblk_l2p_free(pblk);
+fail_free_rwb:
+	pblk_rwb_free(pblk);
+fail_free_lines:
+	pblk_lines_free(pblk);
 fail_free_core:
 fail_free_core:
 	pblk_core_free(pblk);
 	pblk_core_free(pblk);
-fail_free_line_meta:
-	pblk_line_meta_free(pblk);
-fail_free_luns:
-	pblk_luns_free(pblk);
 fail:
 fail:
 	kfree(pblk);
 	kfree(pblk);
 	return ERR_PTR(ret);
 	return ERR_PTR(ret);

+ 4 - 2
drivers/lightnvm/pblk-map.c

@@ -65,6 +65,8 @@ static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
 			lba_list[paddr] = cpu_to_le64(w_ctx->lba);
 			lba_list[paddr] = cpu_to_le64(w_ctx->lba);
 			if (lba_list[paddr] != addr_empty)
 			if (lba_list[paddr] != addr_empty)
 				line->nr_valid_lbas++;
 				line->nr_valid_lbas++;
+			else
+				atomic64_inc(&pblk->pad_wa);
 		} else {
 		} else {
 			lba_list[paddr] = meta_list[i].lba = addr_empty;
 			lba_list[paddr] = meta_list[i].lba = addr_empty;
 			__pblk_map_invalidate(pblk, line, paddr);
 			__pblk_map_invalidate(pblk, line, paddr);
@@ -125,7 +127,7 @@ void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
 			atomic_dec(&e_line->left_eblks);
 			atomic_dec(&e_line->left_eblks);
 
 
 			*erase_ppa = rqd->ppa_list[i];
 			*erase_ppa = rqd->ppa_list[i];
-			erase_ppa->g.blk = e_line->id;
+			erase_ppa->a.blk = e_line->id;
 
 
 			spin_unlock(&e_line->lock);
 			spin_unlock(&e_line->lock);
 
 
@@ -166,6 +168,6 @@ retry:
 		set_bit(bit, e_line->erase_bitmap);
 		set_bit(bit, e_line->erase_bitmap);
 		atomic_dec(&e_line->left_eblks);
 		atomic_dec(&e_line->left_eblks);
 		*erase_ppa = pblk->luns[bit].bppa; /* set ch and lun */
 		*erase_ppa = pblk->luns[bit].bppa; /* set ch and lun */
-		erase_ppa->g.blk = e_line->id;
+		erase_ppa->a.blk = e_line->id;
 	}
 	}
 }
 }

+ 13 - 8
drivers/lightnvm/pblk-rb.c

@@ -355,10 +355,13 @@ static int pblk_rb_flush_point_set(struct pblk_rb *rb, struct bio *bio,
 	struct pblk_rb_entry *entry;
 	struct pblk_rb_entry *entry;
 	unsigned int sync, flush_point;
 	unsigned int sync, flush_point;
 
 
+	pblk_rb_sync_init(rb, NULL);
 	sync = READ_ONCE(rb->sync);
 	sync = READ_ONCE(rb->sync);
 
 
-	if (pos == sync)
+	if (pos == sync) {
+		pblk_rb_sync_end(rb, NULL);
 		return 0;
 		return 0;
+	}
 
 
 #ifdef CONFIG_NVM_DEBUG
 #ifdef CONFIG_NVM_DEBUG
 	atomic_inc(&rb->inflight_flush_point);
 	atomic_inc(&rb->inflight_flush_point);
@@ -367,8 +370,6 @@ static int pblk_rb_flush_point_set(struct pblk_rb *rb, struct bio *bio,
 	flush_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1);
 	flush_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1);
 	entry = &rb->entries[flush_point];
 	entry = &rb->entries[flush_point];
 
 
-	pblk_rb_sync_init(rb, NULL);
-
 	/* Protect flush points */
 	/* Protect flush points */
 	smp_store_release(&rb->flush_point, flush_point);
 	smp_store_release(&rb->flush_point, flush_point);
 
 
@@ -437,9 +438,7 @@ static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries,
 	if (bio->bi_opf & REQ_PREFLUSH) {
 	if (bio->bi_opf & REQ_PREFLUSH) {
 		struct pblk *pblk = container_of(rb, struct pblk, rwb);
 		struct pblk *pblk = container_of(rb, struct pblk, rwb);
 
 
-#ifdef CONFIG_NVM_DEBUG
-		atomic_long_inc(&pblk->nr_flush);
-#endif
+		atomic64_inc(&pblk->nr_flush);
 		if (pblk_rb_flush_point_set(&pblk->rwb, bio, mem))
 		if (pblk_rb_flush_point_set(&pblk->rwb, bio, mem))
 			*io_ret = NVM_IO_OK;
 			*io_ret = NVM_IO_OK;
 	}
 	}
@@ -620,11 +619,17 @@ try:
 			pr_err("pblk: could not pad page in write bio\n");
 			pr_err("pblk: could not pad page in write bio\n");
 			return NVM_IO_ERR;
 			return NVM_IO_ERR;
 		}
 		}
+
+		if (pad < pblk->min_write_pgs)
+			atomic64_inc(&pblk->pad_dist[pad - 1]);
+		else
+			pr_warn("pblk: padding more than min. sectors\n");
+
+		atomic64_add(pad, &pblk->pad_wa);
 	}
 	}
 
 
 #ifdef CONFIG_NVM_DEBUG
 #ifdef CONFIG_NVM_DEBUG
-	atomic_long_add(pad, &((struct pblk *)
-			(container_of(rb, struct pblk, rwb)))->padded_writes);
+	atomic_long_add(pad, &pblk->padded_writes);
 #endif
 #endif
 
 
 	return NVM_IO_OK;
 	return NVM_IO_OK;

+ 1 - 1
drivers/lightnvm/pblk-read.c

@@ -563,7 +563,7 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq)
 	if (!(gc_rq->secs_to_gc))
 	if (!(gc_rq->secs_to_gc))
 		goto out;
 		goto out;
 
 
-	data_len = (gc_rq->secs_to_gc) * geo->sec_size;
+	data_len = (gc_rq->secs_to_gc) * geo->csecs;
 	bio = pblk_bio_map_addr(pblk, gc_rq->data, gc_rq->secs_to_gc, data_len,
 	bio = pblk_bio_map_addr(pblk, gc_rq->data, gc_rq->secs_to_gc, data_len,
 						PBLK_VMALLOC_META, GFP_KERNEL);
 						PBLK_VMALLOC_META, GFP_KERNEL);
 	if (IS_ERR(bio)) {
 	if (IS_ERR(bio)) {

+ 76 - 15
drivers/lightnvm/pblk-recovery.c

@@ -21,17 +21,15 @@ void pblk_submit_rec(struct work_struct *work)
 	struct pblk_rec_ctx *recovery =
 	struct pblk_rec_ctx *recovery =
 			container_of(work, struct pblk_rec_ctx, ws_rec);
 			container_of(work, struct pblk_rec_ctx, ws_rec);
 	struct pblk *pblk = recovery->pblk;
 	struct pblk *pblk = recovery->pblk;
-	struct nvm_tgt_dev *dev = pblk->dev;
 	struct nvm_rq *rqd = recovery->rqd;
 	struct nvm_rq *rqd = recovery->rqd;
 	struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
 	struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
-	int max_secs = nvm_max_phys_sects(dev);
 	struct bio *bio;
 	struct bio *bio;
 	unsigned int nr_rec_secs;
 	unsigned int nr_rec_secs;
 	unsigned int pgs_read;
 	unsigned int pgs_read;
 	int ret;
 	int ret;
 
 
 	nr_rec_secs = bitmap_weight((unsigned long int *)&rqd->ppa_status,
 	nr_rec_secs = bitmap_weight((unsigned long int *)&rqd->ppa_status,
-								max_secs);
+								NVM_MAX_VLBA);
 
 
 	bio = bio_alloc(GFP_KERNEL, nr_rec_secs);
 	bio = bio_alloc(GFP_KERNEL, nr_rec_secs);
 
 
@@ -74,8 +72,6 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
 			struct pblk_rec_ctx *recovery, u64 *comp_bits,
 			struct pblk_rec_ctx *recovery, u64 *comp_bits,
 			unsigned int comp)
 			unsigned int comp)
 {
 {
-	struct nvm_tgt_dev *dev = pblk->dev;
-	int max_secs = nvm_max_phys_sects(dev);
 	struct nvm_rq *rec_rqd;
 	struct nvm_rq *rec_rqd;
 	struct pblk_c_ctx *rec_ctx;
 	struct pblk_c_ctx *rec_ctx;
 	int nr_entries = c_ctx->nr_valid + c_ctx->nr_padded;
 	int nr_entries = c_ctx->nr_valid + c_ctx->nr_padded;
@@ -86,7 +82,7 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
 	/* Copy completion bitmap, but exclude the first X completed entries */
 	/* Copy completion bitmap, but exclude the first X completed entries */
 	bitmap_shift_right((unsigned long int *)&rec_rqd->ppa_status,
 	bitmap_shift_right((unsigned long int *)&rec_rqd->ppa_status,
 				(unsigned long int *)comp_bits,
 				(unsigned long int *)comp_bits,
-				comp, max_secs);
+				comp, NVM_MAX_VLBA);
 
 
 	/* Save the context for the entries that need to be re-written and
 	/* Save the context for the entries that need to be re-written and
 	 * update current context with the completed entries.
 	 * update current context with the completed entries.
@@ -188,7 +184,7 @@ static int pblk_calc_sec_in_line(struct pblk *pblk, struct pblk_line *line)
 	int nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line);
 	int nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line);
 
 
 	return lm->sec_per_line - lm->smeta_sec - lm->emeta_sec[0] -
 	return lm->sec_per_line - lm->smeta_sec - lm->emeta_sec[0] -
-				nr_bb * geo->sec_per_chk;
+				nr_bb * geo->clba;
 }
 }
 
 
 struct pblk_recov_alloc {
 struct pblk_recov_alloc {
@@ -236,7 +232,7 @@ next_read_rq:
 	rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
 	rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
 	if (!rq_ppas)
 	if (!rq_ppas)
 		rq_ppas = pblk->min_write_pgs;
 		rq_ppas = pblk->min_write_pgs;
-	rq_len = rq_ppas * geo->sec_size;
+	rq_len = rq_ppas * geo->csecs;
 
 
 	bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
 	bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
 	if (IS_ERR(bio))
 	if (IS_ERR(bio))
@@ -355,7 +351,7 @@ static int pblk_recov_pad_oob(struct pblk *pblk, struct pblk_line *line,
 	if (!pad_rq)
 	if (!pad_rq)
 		return -ENOMEM;
 		return -ENOMEM;
 
 
-	data = vzalloc(pblk->max_write_pgs * geo->sec_size);
+	data = vzalloc(pblk->max_write_pgs * geo->csecs);
 	if (!data) {
 	if (!data) {
 		ret = -ENOMEM;
 		ret = -ENOMEM;
 		goto free_rq;
 		goto free_rq;
@@ -372,7 +368,7 @@ next_pad_rq:
 		goto fail_free_pad;
 		goto fail_free_pad;
 	}
 	}
 
 
-	rq_len = rq_ppas * geo->sec_size;
+	rq_len = rq_ppas * geo->csecs;
 
 
 	meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_meta_list);
 	meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_meta_list);
 	if (!meta_list) {
 	if (!meta_list) {
@@ -513,7 +509,7 @@ next_rq:
 	rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
 	rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
 	if (!rq_ppas)
 	if (!rq_ppas)
 		rq_ppas = pblk->min_write_pgs;
 		rq_ppas = pblk->min_write_pgs;
-	rq_len = rq_ppas * geo->sec_size;
+	rq_len = rq_ppas * geo->csecs;
 
 
 	bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
 	bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
 	if (IS_ERR(bio))
 	if (IS_ERR(bio))
@@ -644,7 +640,7 @@ next_rq:
 	rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
 	rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
 	if (!rq_ppas)
 	if (!rq_ppas)
 		rq_ppas = pblk->min_write_pgs;
 		rq_ppas = pblk->min_write_pgs;
-	rq_len = rq_ppas * geo->sec_size;
+	rq_len = rq_ppas * geo->csecs;
 
 
 	bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
 	bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
 	if (IS_ERR(bio))
 	if (IS_ERR(bio))
@@ -749,7 +745,7 @@ static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line)
 	ppa_list = (void *)(meta_list) + pblk_dma_meta_size;
 	ppa_list = (void *)(meta_list) + pblk_dma_meta_size;
 	dma_ppa_list = dma_meta_list + pblk_dma_meta_size;
 	dma_ppa_list = dma_meta_list + pblk_dma_meta_size;
 
 
-	data = kcalloc(pblk->max_write_pgs, geo->sec_size, GFP_KERNEL);
+	data = kcalloc(pblk->max_write_pgs, geo->csecs, GFP_KERNEL);
 	if (!data) {
 	if (!data) {
 		ret = -ENOMEM;
 		ret = -ENOMEM;
 		goto free_meta_list;
 		goto free_meta_list;
@@ -826,6 +822,63 @@ static u64 pblk_line_emeta_start(struct pblk *pblk, struct pblk_line *line)
 	return emeta_start;
 	return emeta_start;
 }
 }
 
 
+static int pblk_recov_check_line_version(struct pblk *pblk,
+					 struct line_emeta *emeta)
+{
+	struct line_header *header = &emeta->header;
+
+	if (header->version_major != EMETA_VERSION_MAJOR) {
+		pr_err("pblk: line major version mismatch: %d, expected: %d\n",
+		       header->version_major, EMETA_VERSION_MAJOR);
+		return 1;
+	}
+
+#ifdef NVM_DEBUG
+	if (header->version_minor > EMETA_VERSION_MINOR)
+		pr_info("pblk: newer line minor version found: %d\n", line_v);
+#endif
+
+	return 0;
+}
+
+static void pblk_recov_wa_counters(struct pblk *pblk,
+				   struct line_emeta *emeta)
+{
+	struct pblk_line_meta *lm = &pblk->lm;
+	struct line_header *header = &emeta->header;
+	struct wa_counters *wa = emeta_to_wa(lm, emeta);
+
+	/* WA counters were introduced in emeta version 0.2 */
+	if (header->version_major > 0 || header->version_minor >= 2) {
+		u64 user = le64_to_cpu(wa->user);
+		u64 pad = le64_to_cpu(wa->pad);
+		u64 gc = le64_to_cpu(wa->gc);
+
+		atomic64_set(&pblk->user_wa, user);
+		atomic64_set(&pblk->pad_wa, pad);
+		atomic64_set(&pblk->gc_wa, gc);
+
+		pblk->user_rst_wa = user;
+		pblk->pad_rst_wa = pad;
+		pblk->gc_rst_wa = gc;
+	}
+}
+
+static int pblk_line_was_written(struct pblk_line *line,
+			    struct pblk_line_meta *lm)
+{
+
+	int i;
+	int state_mask = NVM_CHK_ST_OFFLINE | NVM_CHK_ST_FREE;
+
+	for (i = 0; i < lm->blk_per_line; i++) {
+		if (!(line->chks[i].state & state_mask))
+			return 1;
+	}
+
+	return 0;
+}
+
 struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
 struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
 {
 {
 	struct pblk_line_meta *lm = &pblk->lm;
 	struct pblk_line_meta *lm = &pblk->lm;
@@ -862,6 +915,9 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
 		line->lun_bitmap = ((void *)(smeta_buf)) +
 		line->lun_bitmap = ((void *)(smeta_buf)) +
 						sizeof(struct line_smeta);
 						sizeof(struct line_smeta);
 
 
+		if (!pblk_line_was_written(line, lm))
+			continue;
+
 		/* Lines that cannot be read are assumed as not written here */
 		/* Lines that cannot be read are assumed as not written here */
 		if (pblk_line_read_smeta(pblk, line))
 		if (pblk_line_read_smeta(pblk, line))
 			continue;
 			continue;
@@ -873,9 +929,9 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
 		if (le32_to_cpu(smeta_buf->header.identifier) != PBLK_MAGIC)
 		if (le32_to_cpu(smeta_buf->header.identifier) != PBLK_MAGIC)
 			continue;
 			continue;
 
 
-		if (smeta_buf->header.version != SMETA_VERSION) {
+		if (smeta_buf->header.version_major != SMETA_VERSION_MAJOR) {
 			pr_err("pblk: found incompatible line version %u\n",
 			pr_err("pblk: found incompatible line version %u\n",
-					le16_to_cpu(smeta_buf->header.version));
+					smeta_buf->header.version_major);
 			return ERR_PTR(-EINVAL);
 			return ERR_PTR(-EINVAL);
 		}
 		}
 
 
@@ -943,6 +999,11 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
 			goto next;
 			goto next;
 		}
 		}
 
 
+		if (pblk_recov_check_line_version(pblk, line->emeta->buf))
+			return ERR_PTR(-EINVAL);
+
+		pblk_recov_wa_counters(pblk, line->emeta->buf);
+
 		if (pblk_recov_l2p_from_emeta(pblk, line))
 		if (pblk_recov_l2p_from_emeta(pblk, line))
 			pblk_recov_l2p_from_oob(pblk, line);
 			pblk_recov_l2p_from_oob(pblk, line);
 
 

+ 1 - 1
drivers/lightnvm/pblk-rl.c

@@ -200,7 +200,7 @@ void pblk_rl_init(struct pblk_rl *rl, int budget)
 
 
 	/* Consider sectors used for metadata */
 	/* Consider sectors used for metadata */
 	sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
 	sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
-	blk_meta = DIV_ROUND_UP(sec_meta, geo->sec_per_chk);
+	blk_meta = DIV_ROUND_UP(sec_meta, geo->clba);
 
 
 	rl->high = pblk->op_blks - blk_meta - lm->blk_per_line;
 	rl->high = pblk->op_blks - blk_meta - lm->blk_per_line;
 	rl->high_pw = get_count_order(rl->high);
 	rl->high_pw = get_count_order(rl->high);

+ 212 - 23
drivers/lightnvm/pblk-sysfs.c

@@ -39,8 +39,8 @@ static ssize_t pblk_sysfs_luns_show(struct pblk *pblk, char *page)
 		sz += snprintf(page + sz, PAGE_SIZE - sz,
 		sz += snprintf(page + sz, PAGE_SIZE - sz,
 				"pblk: pos:%d, ch:%d, lun:%d - %d\n",
 				"pblk: pos:%d, ch:%d, lun:%d - %d\n",
 					i,
 					i,
-					rlun->bppa.g.ch,
-					rlun->bppa.g.lun,
+					rlun->bppa.a.ch,
+					rlun->bppa.a.lun,
 					active);
 					active);
 	}
 	}
 
 
@@ -115,24 +115,47 @@ static ssize_t pblk_sysfs_ppaf(struct pblk *pblk, char *page)
 	struct nvm_geo *geo = &dev->geo;
 	struct nvm_geo *geo = &dev->geo;
 	ssize_t sz = 0;
 	ssize_t sz = 0;
 
 
-	sz = snprintf(page, PAGE_SIZE - sz,
-		"g:(b:%d)blk:%d/%d,pg:%d/%d,lun:%d/%d,ch:%d/%d,pl:%d/%d,sec:%d/%d\n",
-		pblk->ppaf_bitsize,
-		pblk->ppaf.blk_offset, geo->ppaf.blk_len,
-		pblk->ppaf.pg_offset, geo->ppaf.pg_len,
-		pblk->ppaf.lun_offset, geo->ppaf.lun_len,
-		pblk->ppaf.ch_offset, geo->ppaf.ch_len,
-		pblk->ppaf.pln_offset, geo->ppaf.pln_len,
-		pblk->ppaf.sec_offset, geo->ppaf.sect_len);
+	if (geo->version == NVM_OCSSD_SPEC_12) {
+		struct nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)&pblk->addrf;
+		struct nvm_addrf_12 *gppaf = (struct nvm_addrf_12 *)&geo->addrf;
 
 
-	sz += snprintf(page + sz, PAGE_SIZE - sz,
-		"d:blk:%d/%d,pg:%d/%d,lun:%d/%d,ch:%d/%d,pl:%d/%d,sec:%d/%d\n",
-		geo->ppaf.blk_offset, geo->ppaf.blk_len,
-		geo->ppaf.pg_offset, geo->ppaf.pg_len,
-		geo->ppaf.lun_offset, geo->ppaf.lun_len,
-		geo->ppaf.ch_offset, geo->ppaf.ch_len,
-		geo->ppaf.pln_offset, geo->ppaf.pln_len,
-		geo->ppaf.sect_offset, geo->ppaf.sect_len);
+		sz = snprintf(page, PAGE_SIZE,
+			"g:(b:%d)blk:%d/%d,pg:%d/%d,lun:%d/%d,ch:%d/%d,pl:%d/%d,sec:%d/%d\n",
+			pblk->addrf_len,
+			ppaf->blk_offset, ppaf->blk_len,
+			ppaf->pg_offset, ppaf->pg_len,
+			ppaf->lun_offset, ppaf->lun_len,
+			ppaf->ch_offset, ppaf->ch_len,
+			ppaf->pln_offset, ppaf->pln_len,
+			ppaf->sec_offset, ppaf->sec_len);
+
+		sz += snprintf(page + sz, PAGE_SIZE - sz,
+			"d:blk:%d/%d,pg:%d/%d,lun:%d/%d,ch:%d/%d,pl:%d/%d,sec:%d/%d\n",
+			gppaf->blk_offset, gppaf->blk_len,
+			gppaf->pg_offset, gppaf->pg_len,
+			gppaf->lun_offset, gppaf->lun_len,
+			gppaf->ch_offset, gppaf->ch_len,
+			gppaf->pln_offset, gppaf->pln_len,
+			gppaf->sec_offset, gppaf->sec_len);
+	} else {
+		struct nvm_addrf *ppaf = &pblk->addrf;
+		struct nvm_addrf *gppaf = &geo->addrf;
+
+		sz = snprintf(page, PAGE_SIZE,
+			"pblk:(s:%d)ch:%d/%d,lun:%d/%d,chk:%d/%d/sec:%d/%d\n",
+			pblk->addrf_len,
+			ppaf->ch_offset, ppaf->ch_len,
+			ppaf->lun_offset, ppaf->lun_len,
+			ppaf->chk_offset, ppaf->chk_len,
+			ppaf->sec_offset, ppaf->sec_len);
+
+		sz += snprintf(page + sz, PAGE_SIZE - sz,
+			"device:ch:%d/%d,lun:%d/%d,chk:%d/%d,sec:%d/%d\n",
+			gppaf->ch_offset, gppaf->ch_len,
+			gppaf->lun_offset, gppaf->lun_len,
+			gppaf->chk_offset, gppaf->chk_len,
+			gppaf->sec_offset, gppaf->sec_len);
+	}
 
 
 	return sz;
 	return sz;
 }
 }
@@ -288,7 +311,7 @@ static ssize_t pblk_sysfs_lines_info(struct pblk *pblk, char *page)
 				"blk_line:%d, sec_line:%d, sec_blk:%d\n",
 				"blk_line:%d, sec_line:%d, sec_blk:%d\n",
 					lm->blk_per_line,
 					lm->blk_per_line,
 					lm->sec_per_line,
 					lm->sec_per_line,
-					geo->sec_per_chk);
+					geo->clba);
 
 
 	return sz;
 	return sz;
 }
 }
@@ -298,15 +321,104 @@ static ssize_t pblk_sysfs_get_sec_per_write(struct pblk *pblk, char *page)
 	return snprintf(page, PAGE_SIZE, "%d\n", pblk->sec_per_write);
 	return snprintf(page, PAGE_SIZE, "%d\n", pblk->sec_per_write);
 }
 }
 
 
+static ssize_t pblk_get_write_amp(u64 user, u64 gc, u64 pad,
+				  char *page)
+{
+	int sz;
+
+
+	sz = snprintf(page, PAGE_SIZE,
+			"user:%lld gc:%lld pad:%lld WA:",
+			user, gc, pad);
+
+	if (!user) {
+		sz += snprintf(page + sz, PAGE_SIZE - sz, "NaN\n");
+	} else {
+		u64 wa_int;
+		u32 wa_frac;
+
+		wa_int = (user + gc + pad) * 100000;
+		wa_int = div_u64(wa_int, user);
+		wa_int = div_u64_rem(wa_int, 100000, &wa_frac);
+
+		sz += snprintf(page + sz, PAGE_SIZE - sz, "%llu.%05u\n",
+							wa_int, wa_frac);
+	}
+
+	return sz;
+}
+
+static ssize_t pblk_sysfs_get_write_amp_mileage(struct pblk *pblk, char *page)
+{
+	return pblk_get_write_amp(atomic64_read(&pblk->user_wa),
+		atomic64_read(&pblk->gc_wa), atomic64_read(&pblk->pad_wa),
+		page);
+}
+
+static ssize_t pblk_sysfs_get_write_amp_trip(struct pblk *pblk, char *page)
+{
+	return pblk_get_write_amp(
+		atomic64_read(&pblk->user_wa) - pblk->user_rst_wa,
+		atomic64_read(&pblk->gc_wa) - pblk->gc_rst_wa,
+		atomic64_read(&pblk->pad_wa) - pblk->pad_rst_wa, page);
+}
+
+static long long bucket_percentage(unsigned long long bucket,
+				   unsigned long long total)
+{
+	int p = bucket * 100;
+
+	p = div_u64(p, total);
+
+	return p;
+}
+
+static ssize_t pblk_sysfs_get_padding_dist(struct pblk *pblk, char *page)
+{
+	int sz = 0;
+	unsigned long long total;
+	unsigned long long total_buckets = 0;
+	int buckets = pblk->min_write_pgs - 1;
+	int i;
+
+	total = atomic64_read(&pblk->nr_flush) - pblk->nr_flush_rst;
+	if (!total) {
+		for (i = 0; i < (buckets + 1); i++)
+			sz += snprintf(page + sz, PAGE_SIZE - sz,
+				"%d:0 ", i);
+		sz += snprintf(page + sz, PAGE_SIZE - sz, "\n");
+
+		return sz;
+	}
+
+	for (i = 0; i < buckets; i++)
+		total_buckets += atomic64_read(&pblk->pad_dist[i]);
+
+	sz += snprintf(page + sz, PAGE_SIZE - sz, "0:%lld%% ",
+		bucket_percentage(total - total_buckets, total));
+
+	for (i = 0; i < buckets; i++) {
+		unsigned long long p;
+
+		p = bucket_percentage(atomic64_read(&pblk->pad_dist[i]),
+					  total);
+		sz += snprintf(page + sz, PAGE_SIZE - sz, "%d:%lld%% ",
+				i + 1, p);
+	}
+	sz += snprintf(page + sz, PAGE_SIZE - sz, "\n");
+
+	return sz;
+}
+
 #ifdef CONFIG_NVM_DEBUG
 #ifdef CONFIG_NVM_DEBUG
 static ssize_t pblk_sysfs_stats_debug(struct pblk *pblk, char *page)
 static ssize_t pblk_sysfs_stats_debug(struct pblk *pblk, char *page)
 {
 {
 	return snprintf(page, PAGE_SIZE,
 	return snprintf(page, PAGE_SIZE,
-		"%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\n",
+		"%lu\t%lu\t%ld\t%llu\t%ld\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\n",
 			atomic_long_read(&pblk->inflight_writes),
 			atomic_long_read(&pblk->inflight_writes),
 			atomic_long_read(&pblk->inflight_reads),
 			atomic_long_read(&pblk->inflight_reads),
 			atomic_long_read(&pblk->req_writes),
 			atomic_long_read(&pblk->req_writes),
-			atomic_long_read(&pblk->nr_flush),
+			(u64)atomic64_read(&pblk->nr_flush),
 			atomic_long_read(&pblk->padded_writes),
 			atomic_long_read(&pblk->padded_writes),
 			atomic_long_read(&pblk->padded_wb),
 			atomic_long_read(&pblk->padded_wb),
 			atomic_long_read(&pblk->sub_writes),
 			atomic_long_read(&pblk->sub_writes),
@@ -360,6 +472,56 @@ static ssize_t pblk_sysfs_set_sec_per_write(struct pblk *pblk,
 	return len;
 	return len;
 }
 }
 
 
+static ssize_t pblk_sysfs_set_write_amp_trip(struct pblk *pblk,
+			const char *page, size_t len)
+{
+	size_t c_len;
+	int reset_value;
+
+	c_len = strcspn(page, "\n");
+	if (c_len >= len)
+		return -EINVAL;
+
+	if (kstrtouint(page, 0, &reset_value))
+		return -EINVAL;
+
+	if (reset_value !=  0)
+		return -EINVAL;
+
+	pblk->user_rst_wa = atomic64_read(&pblk->user_wa);
+	pblk->pad_rst_wa = atomic64_read(&pblk->pad_wa);
+	pblk->gc_rst_wa = atomic64_read(&pblk->gc_wa);
+
+	return len;
+}
+
+
+static ssize_t pblk_sysfs_set_padding_dist(struct pblk *pblk,
+			const char *page, size_t len)
+{
+	size_t c_len;
+	int reset_value;
+	int buckets = pblk->min_write_pgs - 1;
+	int i;
+
+	c_len = strcspn(page, "\n");
+	if (c_len >= len)
+		return -EINVAL;
+
+	if (kstrtouint(page, 0, &reset_value))
+		return -EINVAL;
+
+	if (reset_value !=  0)
+		return -EINVAL;
+
+	for (i = 0; i < buckets; i++)
+		atomic64_set(&pblk->pad_dist[i], 0);
+
+	pblk->nr_flush_rst = atomic64_read(&pblk->nr_flush);
+
+	return len;
+}
+
 static struct attribute sys_write_luns = {
 static struct attribute sys_write_luns = {
 	.name = "write_luns",
 	.name = "write_luns",
 	.mode = 0444,
 	.mode = 0444,
@@ -410,6 +572,21 @@ static struct attribute sys_max_sec_per_write = {
 	.mode = 0644,
 	.mode = 0644,
 };
 };
 
 
+static struct attribute sys_write_amp_mileage = {
+	.name = "write_amp_mileage",
+	.mode = 0444,
+};
+
+static struct attribute sys_write_amp_trip = {
+	.name = "write_amp_trip",
+	.mode = 0644,
+};
+
+static struct attribute sys_padding_dist = {
+	.name = "padding_dist",
+	.mode = 0644,
+};
+
 #ifdef CONFIG_NVM_DEBUG
 #ifdef CONFIG_NVM_DEBUG
 static struct attribute sys_stats_debug_attr = {
 static struct attribute sys_stats_debug_attr = {
 	.name = "stats",
 	.name = "stats",
@@ -428,6 +605,9 @@ static struct attribute *pblk_attrs[] = {
 	&sys_stats_ppaf_attr,
 	&sys_stats_ppaf_attr,
 	&sys_lines_attr,
 	&sys_lines_attr,
 	&sys_lines_info_attr,
 	&sys_lines_info_attr,
+	&sys_write_amp_mileage,
+	&sys_write_amp_trip,
+	&sys_padding_dist,
 #ifdef CONFIG_NVM_DEBUG
 #ifdef CONFIG_NVM_DEBUG
 	&sys_stats_debug_attr,
 	&sys_stats_debug_attr,
 #endif
 #endif
@@ -457,6 +637,12 @@ static ssize_t pblk_sysfs_show(struct kobject *kobj, struct attribute *attr,
 		return pblk_sysfs_lines_info(pblk, buf);
 		return pblk_sysfs_lines_info(pblk, buf);
 	else if (strcmp(attr->name, "max_sec_per_write") == 0)
 	else if (strcmp(attr->name, "max_sec_per_write") == 0)
 		return pblk_sysfs_get_sec_per_write(pblk, buf);
 		return pblk_sysfs_get_sec_per_write(pblk, buf);
+	else if (strcmp(attr->name, "write_amp_mileage") == 0)
+		return pblk_sysfs_get_write_amp_mileage(pblk, buf);
+	else if (strcmp(attr->name, "write_amp_trip") == 0)
+		return pblk_sysfs_get_write_amp_trip(pblk, buf);
+	else if (strcmp(attr->name, "padding_dist") == 0)
+		return pblk_sysfs_get_padding_dist(pblk, buf);
 #ifdef CONFIG_NVM_DEBUG
 #ifdef CONFIG_NVM_DEBUG
 	else if (strcmp(attr->name, "stats") == 0)
 	else if (strcmp(attr->name, "stats") == 0)
 		return pblk_sysfs_stats_debug(pblk, buf);
 		return pblk_sysfs_stats_debug(pblk, buf);
@@ -473,7 +659,10 @@ static ssize_t pblk_sysfs_store(struct kobject *kobj, struct attribute *attr,
 		return pblk_sysfs_gc_force(pblk, buf, len);
 		return pblk_sysfs_gc_force(pblk, buf, len);
 	else if (strcmp(attr->name, "max_sec_per_write") == 0)
 	else if (strcmp(attr->name, "max_sec_per_write") == 0)
 		return pblk_sysfs_set_sec_per_write(pblk, buf, len);
 		return pblk_sysfs_set_sec_per_write(pblk, buf, len);
-
+	else if (strcmp(attr->name, "write_amp_trip") == 0)
+		return pblk_sysfs_set_write_amp_trip(pblk, buf, len);
+	else if (strcmp(attr->name, "padding_dist") == 0)
+		return pblk_sysfs_set_padding_dist(pblk, buf, len);
 	return 0;
 	return 0;
 }
 }
 
 

+ 1 - 1
drivers/lightnvm/pblk-write.c

@@ -333,7 +333,7 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
 	m_ctx = nvm_rq_to_pdu(rqd);
 	m_ctx = nvm_rq_to_pdu(rqd);
 	m_ctx->private = meta_line;
 	m_ctx->private = meta_line;
 
 
-	rq_len = rq_ppas * geo->sec_size;
+	rq_len = rq_ppas * geo->csecs;
 	data = ((void *)emeta->buf) + emeta->mem;
 	data = ((void *)emeta->buf) + emeta->mem;
 
 
 	bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len,
 	bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len,

+ 220 - 84
drivers/lightnvm/pblk.h

@@ -201,12 +201,6 @@ struct pblk_rb {
 
 
 struct pblk_lun {
 struct pblk_lun {
 	struct ppa_addr bppa;
 	struct ppa_addr bppa;
-
-	u8 *bb_list;			/* Bad block list for LUN. Only used on
-					 * bring up. Bad blocks are managed
-					 * within lines on run-time.
-					 */
-
 	struct semaphore wr_sem;
 	struct semaphore wr_sem;
 };
 };
 
 
@@ -303,6 +297,7 @@ enum {
 	PBLK_LINETYPE_DATA = 2,
 	PBLK_LINETYPE_DATA = 2,
 
 
 	/* Line state */
 	/* Line state */
+	PBLK_LINESTATE_NEW = 9,
 	PBLK_LINESTATE_FREE = 10,
 	PBLK_LINESTATE_FREE = 10,
 	PBLK_LINESTATE_OPEN = 11,
 	PBLK_LINESTATE_OPEN = 11,
 	PBLK_LINESTATE_CLOSED = 12,
 	PBLK_LINESTATE_CLOSED = 12,
@@ -320,14 +315,26 @@ enum {
 };
 };
 
 
 #define PBLK_MAGIC 0x70626c6b /*pblk*/
 #define PBLK_MAGIC 0x70626c6b /*pblk*/
-#define SMETA_VERSION cpu_to_le16(1)
+
+/* emeta/smeta persistent storage format versions:
+ * Changes in major version requires offline migration.
+ * Changes in minor version are handled automatically during
+ * recovery.
+ */
+
+#define SMETA_VERSION_MAJOR (0)
+#define SMETA_VERSION_MINOR (1)
+
+#define EMETA_VERSION_MAJOR (0)
+#define EMETA_VERSION_MINOR (2)
 
 
 struct line_header {
 struct line_header {
 	__le32 crc;
 	__le32 crc;
 	__le32 identifier;	/* pblk identifier */
 	__le32 identifier;	/* pblk identifier */
 	__u8 uuid[16];		/* instance uuid */
 	__u8 uuid[16];		/* instance uuid */
 	__le16 type;		/* line type */
 	__le16 type;		/* line type */
-	__le16 version;		/* type version */
+	__u8 version_major;	/* version major */
+	__u8 version_minor;	/* version minor */
 	__le32 id;		/* line id for current line */
 	__le32 id;		/* line id for current line */
 };
 };
 
 
@@ -349,11 +356,13 @@ struct line_smeta {
 	__le64 lun_bitmap[];
 	__le64 lun_bitmap[];
 };
 };
 
 
+
 /*
 /*
  * Metadata layout in media:
  * Metadata layout in media:
  *	First sector:
  *	First sector:
  *		1. struct line_emeta
  *		1. struct line_emeta
  *		2. bad block bitmap (u64 * window_wr_lun)
  *		2. bad block bitmap (u64 * window_wr_lun)
+ *		3. write amplification counters
  *	Mid sectors (start at lbas_sector):
  *	Mid sectors (start at lbas_sector):
  *		3. nr_lbas (u64) forming lba list
  *		3. nr_lbas (u64) forming lba list
  *	Last sectors (start at vsc_sector):
  *	Last sectors (start at vsc_sector):
@@ -377,7 +386,15 @@ struct line_emeta {
 	__le32 next_id;		/* Line id for next line */
 	__le32 next_id;		/* Line id for next line */
 	__le64 nr_lbas;		/* Number of lbas mapped in line */
 	__le64 nr_lbas;		/* Number of lbas mapped in line */
 	__le64 nr_valid_lbas;	/* Number of valid lbas mapped in line */
 	__le64 nr_valid_lbas;	/* Number of valid lbas mapped in line */
-	__le64 bb_bitmap[];	/* Updated bad block bitmap for line */
+	__le64 bb_bitmap[];     /* Updated bad block bitmap for line */
+};
+
+
+/* Write amplification counters stored on media */
+struct wa_counters {
+	__le64 user;		/* Number of user written sectors */
+	__le64 gc;		/* Number of sectors written by GC*/
+	__le64 pad;		/* Number of padded sectors */
 };
 };
 
 
 struct pblk_emeta {
 struct pblk_emeta {
@@ -410,6 +427,8 @@ struct pblk_line {
 
 
 	unsigned long *lun_bitmap;	/* Bitmap for LUNs mapped in line */
 	unsigned long *lun_bitmap;	/* Bitmap for LUNs mapped in line */
 
 
+	struct nvm_chk_meta *chks;	/* Chunks forming line */
+
 	struct pblk_smeta *smeta;	/* Start metadata */
 	struct pblk_smeta *smeta;	/* Start metadata */
 	struct pblk_emeta *emeta;	/* End medatada */
 	struct pblk_emeta *emeta;	/* End medatada */
 
 
@@ -507,10 +526,11 @@ struct pblk_line_meta {
 	unsigned int smeta_sec;		/* Sectors needed for smeta */
 	unsigned int smeta_sec;		/* Sectors needed for smeta */
 
 
 	unsigned int emeta_len[4];	/* Lengths for emeta:
 	unsigned int emeta_len[4];	/* Lengths for emeta:
-					 *  [0]: Total length
-					 *  [1]: struct line_emeta length
-					 *  [2]: L2P portion length
-					 *  [3]: vsc list length
+					 *  [0]: Total
+					 *  [1]: struct line_emeta +
+					 *       bb_bitmap + struct wa_counters
+					 *  [2]: L2P portion
+					 *  [3]: vsc
 					 */
 					 */
 	unsigned int emeta_sec[4];	/* Sectors needed for emeta. Same layout
 	unsigned int emeta_sec[4];	/* Sectors needed for emeta. Same layout
 					 * as emeta_len
 					 * as emeta_len
@@ -534,21 +554,6 @@ struct pblk_line_meta {
 	unsigned int meta_distance;	/* Distance between data and metadata */
 	unsigned int meta_distance;	/* Distance between data and metadata */
 };
 };
 
 
-struct pblk_addr_format {
-	u64	ch_mask;
-	u64	lun_mask;
-	u64	pln_mask;
-	u64	blk_mask;
-	u64	pg_mask;
-	u64	sec_mask;
-	u8	ch_offset;
-	u8	lun_offset;
-	u8	pln_offset;
-	u8	blk_offset;
-	u8	pg_offset;
-	u8	sec_offset;
-};
-
 enum {
 enum {
 	PBLK_STATE_RUNNING = 0,
 	PBLK_STATE_RUNNING = 0,
 	PBLK_STATE_STOPPING = 1,
 	PBLK_STATE_STOPPING = 1,
@@ -556,6 +561,18 @@ enum {
 	PBLK_STATE_STOPPED = 3,
 	PBLK_STATE_STOPPED = 3,
 };
 };
 
 
+/* Internal format to support not power-of-2 device formats */
+struct pblk_addrf {
+	/* gen to dev */
+	int sec_stripe;
+	int ch_stripe;
+	int lun_stripe;
+
+	/* dev to gen */
+	int sec_lun_stripe;
+	int sec_ws_stripe;
+};
+
 struct pblk {
 struct pblk {
 	struct nvm_tgt_dev *dev;
 	struct nvm_tgt_dev *dev;
 	struct gendisk *disk;
 	struct gendisk *disk;
@@ -568,8 +585,9 @@ struct pblk {
 	struct pblk_line_mgmt l_mg;		/* Line management */
 	struct pblk_line_mgmt l_mg;		/* Line management */
 	struct pblk_line_meta lm;		/* Line metadata */
 	struct pblk_line_meta lm;		/* Line metadata */
 
 
-	int ppaf_bitsize;
-	struct pblk_addr_format ppaf;
+	struct nvm_addrf addrf;		/* Aligned address format */
+	struct pblk_addrf uaddrf;	/* Unaligned address format */
+	int addrf_len;
 
 
 	struct pblk_rb rwb;
 	struct pblk_rb rwb;
 
 
@@ -592,12 +610,27 @@ struct pblk {
 	int sec_per_write;
 	int sec_per_write;
 
 
 	unsigned char instance_uuid[16];
 	unsigned char instance_uuid[16];
+
+	/* Persistent write amplification counters, 4kb sector I/Os */
+	atomic64_t user_wa;		/* Sectors written by user */
+	atomic64_t gc_wa;		/* Sectors written by GC */
+	atomic64_t pad_wa;		/* Padded sectors written */
+
+	/* Reset values for delta write amplification measurements */
+	u64 user_rst_wa;
+	u64 gc_rst_wa;
+	u64 pad_rst_wa;
+
+	/* Counters used for calculating padding distribution */
+	atomic64_t *pad_dist;		/* Padding distribution buckets */
+	u64 nr_flush_rst;		/* Flushes reset value for pad dist.*/
+	atomic64_t nr_flush;		/* Number of flush/fua I/O */
+
 #ifdef CONFIG_NVM_DEBUG
 #ifdef CONFIG_NVM_DEBUG
-	/* All debug counters apply to 4kb sector I/Os */
+	/* Non-persistent debug counters, 4kb sector I/Os */
 	atomic_long_t inflight_writes;	/* Inflight writes (user and gc) */
 	atomic_long_t inflight_writes;	/* Inflight writes (user and gc) */
 	atomic_long_t padded_writes;	/* Sectors padded due to flush/fua */
 	atomic_long_t padded_writes;	/* Sectors padded due to flush/fua */
 	atomic_long_t padded_wb;	/* Sectors padded in write buffer */
 	atomic_long_t padded_wb;	/* Sectors padded in write buffer */
-	atomic_long_t nr_flush;		/* Number of flush/fua I/O */
 	atomic_long_t req_writes;	/* Sectors stored on write buffer */
 	atomic_long_t req_writes;	/* Sectors stored on write buffer */
 	atomic_long_t sub_writes;	/* Sectors submitted from buffer */
 	atomic_long_t sub_writes;	/* Sectors submitted from buffer */
 	atomic_long_t sync_writes;	/* Sectors synced to media */
 	atomic_long_t sync_writes;	/* Sectors synced to media */
@@ -712,6 +745,10 @@ void pblk_set_sec_per_write(struct pblk *pblk, int sec_per_write);
 int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd,
 int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd,
 			struct pblk_c_ctx *c_ctx);
 			struct pblk_c_ctx *c_ctx);
 void pblk_discard(struct pblk *pblk, struct bio *bio);
 void pblk_discard(struct pblk *pblk, struct bio *bio);
+struct nvm_chk_meta *pblk_chunk_get_info(struct pblk *pblk);
+struct nvm_chk_meta *pblk_chunk_get_off(struct pblk *pblk,
+					      struct nvm_chk_meta *lp,
+					      struct ppa_addr ppa);
 void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd);
 void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd);
 void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd);
 void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd);
 int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd);
 int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd);
@@ -888,6 +925,12 @@ static inline void *emeta_to_bb(struct line_emeta *emeta)
 	return emeta->bb_bitmap;
 	return emeta->bb_bitmap;
 }
 }
 
 
+static inline void *emeta_to_wa(struct pblk_line_meta *lm,
+				struct line_emeta *emeta)
+{
+	return emeta->bb_bitmap + lm->blk_bitmap_len;
+}
+
 static inline void *emeta_to_lbas(struct pblk *pblk, struct line_emeta *emeta)
 static inline void *emeta_to_lbas(struct pblk *pblk, struct line_emeta *emeta)
 {
 {
 	return ((void *)emeta + pblk->lm.emeta_len[1]);
 	return ((void *)emeta + pblk->lm.emeta_len[1]);
@@ -903,38 +946,60 @@ static inline int pblk_line_vsc(struct pblk_line *line)
 	return le32_to_cpu(*line->vsc);
 	return le32_to_cpu(*line->vsc);
 }
 }
 
 
-#define NVM_MEM_PAGE_WRITE (8)
-
 static inline int pblk_pad_distance(struct pblk *pblk)
 static inline int pblk_pad_distance(struct pblk *pblk)
 {
 {
 	struct nvm_tgt_dev *dev = pblk->dev;
 	struct nvm_tgt_dev *dev = pblk->dev;
 	struct nvm_geo *geo = &dev->geo;
 	struct nvm_geo *geo = &dev->geo;
 
 
-	return NVM_MEM_PAGE_WRITE * geo->all_luns * geo->sec_per_pl;
+	return geo->mw_cunits * geo->all_luns * geo->ws_opt;
 }
 }
 
 
 static inline int pblk_ppa_to_line(struct ppa_addr p)
 static inline int pblk_ppa_to_line(struct ppa_addr p)
 {
 {
-	return p.g.blk;
+	return p.a.blk;
 }
 }
 
 
 static inline int pblk_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p)
 static inline int pblk_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p)
 {
 {
-	return p.g.lun * geo->nr_chnls + p.g.ch;
+	return p.a.lun * geo->num_ch + p.a.ch;
 }
 }
 
 
 static inline struct ppa_addr addr_to_gen_ppa(struct pblk *pblk, u64 paddr,
 static inline struct ppa_addr addr_to_gen_ppa(struct pblk *pblk, u64 paddr,
 					      u64 line_id)
 					      u64 line_id)
 {
 {
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
 	struct ppa_addr ppa;
 	struct ppa_addr ppa;
 
 
-	ppa.ppa = 0;
-	ppa.g.blk = line_id;
-	ppa.g.pg = (paddr & pblk->ppaf.pg_mask) >> pblk->ppaf.pg_offset;
-	ppa.g.lun = (paddr & pblk->ppaf.lun_mask) >> pblk->ppaf.lun_offset;
-	ppa.g.ch = (paddr & pblk->ppaf.ch_mask) >> pblk->ppaf.ch_offset;
-	ppa.g.pl = (paddr & pblk->ppaf.pln_mask) >> pblk->ppaf.pln_offset;
-	ppa.g.sec = (paddr & pblk->ppaf.sec_mask) >> pblk->ppaf.sec_offset;
+	if (geo->version == NVM_OCSSD_SPEC_12) {
+		struct nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)&pblk->addrf;
+
+		ppa.ppa = 0;
+		ppa.g.blk = line_id;
+		ppa.g.pg = (paddr & ppaf->pg_mask) >> ppaf->pg_offset;
+		ppa.g.lun = (paddr & ppaf->lun_mask) >> ppaf->lun_offset;
+		ppa.g.ch = (paddr & ppaf->ch_mask) >> ppaf->ch_offset;
+		ppa.g.pl = (paddr & ppaf->pln_mask) >> ppaf->pln_offset;
+		ppa.g.sec = (paddr & ppaf->sec_mask) >> ppaf->sec_offset;
+	} else {
+		struct pblk_addrf *uaddrf = &pblk->uaddrf;
+		int secs, chnls, luns;
+
+		ppa.ppa = 0;
+
+		ppa.m.chk = line_id;
+
+		paddr = div_u64_rem(paddr, uaddrf->sec_stripe, &secs);
+		ppa.m.sec = secs;
+
+		paddr = div_u64_rem(paddr, uaddrf->ch_stripe, &chnls);
+		ppa.m.grp = chnls;
+
+		paddr = div_u64_rem(paddr, uaddrf->lun_stripe, &luns);
+		ppa.m.pu = luns;
+
+		ppa.m.sec += uaddrf->sec_stripe * paddr;
+	}
 
 
 	return ppa;
 	return ppa;
 }
 }
@@ -942,13 +1007,30 @@ static inline struct ppa_addr addr_to_gen_ppa(struct pblk *pblk, u64 paddr,
 static inline u64 pblk_dev_ppa_to_line_addr(struct pblk *pblk,
 static inline u64 pblk_dev_ppa_to_line_addr(struct pblk *pblk,
 							struct ppa_addr p)
 							struct ppa_addr p)
 {
 {
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
 	u64 paddr;
 	u64 paddr;
 
 
-	paddr = (u64)p.g.pg << pblk->ppaf.pg_offset;
-	paddr |= (u64)p.g.lun << pblk->ppaf.lun_offset;
-	paddr |= (u64)p.g.ch << pblk->ppaf.ch_offset;
-	paddr |= (u64)p.g.pl << pblk->ppaf.pln_offset;
-	paddr |= (u64)p.g.sec << pblk->ppaf.sec_offset;
+	if (geo->version == NVM_OCSSD_SPEC_12) {
+		struct nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)&pblk->addrf;
+
+		paddr = (u64)p.g.ch << ppaf->ch_offset;
+		paddr |= (u64)p.g.lun << ppaf->lun_offset;
+		paddr |= (u64)p.g.pg << ppaf->pg_offset;
+		paddr |= (u64)p.g.pl << ppaf->pln_offset;
+		paddr |= (u64)p.g.sec << ppaf->sec_offset;
+	} else {
+		struct pblk_addrf *uaddrf = &pblk->uaddrf;
+		u64 secs = p.m.sec;
+		int sec_stripe;
+
+		paddr = (u64)p.m.grp * uaddrf->sec_stripe;
+		paddr += (u64)p.m.pu * uaddrf->sec_lun_stripe;
+
+		secs = div_u64_rem(secs, uaddrf->sec_stripe, &sec_stripe);
+		paddr += secs * uaddrf->sec_ws_stripe;
+		paddr += sec_stripe;
+	}
 
 
 	return paddr;
 	return paddr;
 }
 }
@@ -965,18 +1047,37 @@ static inline struct ppa_addr pblk_ppa32_to_ppa64(struct pblk *pblk, u32 ppa32)
 		ppa64.c.line = ppa32 & ((~0U) >> 1);
 		ppa64.c.line = ppa32 & ((~0U) >> 1);
 		ppa64.c.is_cached = 1;
 		ppa64.c.is_cached = 1;
 	} else {
 	} else {
-		ppa64.g.blk = (ppa32 & pblk->ppaf.blk_mask) >>
-							pblk->ppaf.blk_offset;
-		ppa64.g.pg = (ppa32 & pblk->ppaf.pg_mask) >>
-							pblk->ppaf.pg_offset;
-		ppa64.g.lun = (ppa32 & pblk->ppaf.lun_mask) >>
-							pblk->ppaf.lun_offset;
-		ppa64.g.ch = (ppa32 & pblk->ppaf.ch_mask) >>
-							pblk->ppaf.ch_offset;
-		ppa64.g.pl = (ppa32 & pblk->ppaf.pln_mask) >>
-							pblk->ppaf.pln_offset;
-		ppa64.g.sec = (ppa32 & pblk->ppaf.sec_mask) >>
-							pblk->ppaf.sec_offset;
+		struct nvm_tgt_dev *dev = pblk->dev;
+		struct nvm_geo *geo = &dev->geo;
+
+		if (geo->version == NVM_OCSSD_SPEC_12) {
+			struct nvm_addrf_12 *ppaf =
+					(struct nvm_addrf_12 *)&pblk->addrf;
+
+			ppa64.g.ch = (ppa32 & ppaf->ch_mask) >>
+							ppaf->ch_offset;
+			ppa64.g.lun = (ppa32 & ppaf->lun_mask) >>
+							ppaf->lun_offset;
+			ppa64.g.blk = (ppa32 & ppaf->blk_mask) >>
+							ppaf->blk_offset;
+			ppa64.g.pg = (ppa32 & ppaf->pg_mask) >>
+							ppaf->pg_offset;
+			ppa64.g.pl = (ppa32 & ppaf->pln_mask) >>
+							ppaf->pln_offset;
+			ppa64.g.sec = (ppa32 & ppaf->sec_mask) >>
+							ppaf->sec_offset;
+		} else {
+			struct nvm_addrf *lbaf = &pblk->addrf;
+
+			ppa64.m.grp = (ppa32 & lbaf->ch_mask) >>
+							lbaf->ch_offset;
+			ppa64.m.pu = (ppa32 & lbaf->lun_mask) >>
+							lbaf->lun_offset;
+			ppa64.m.chk = (ppa32 & lbaf->chk_mask) >>
+							lbaf->chk_offset;
+			ppa64.m.sec = (ppa32 & lbaf->sec_mask) >>
+							lbaf->sec_offset;
+		}
 	}
 	}
 
 
 	return ppa64;
 	return ppa64;
@@ -992,12 +1093,27 @@ static inline u32 pblk_ppa64_to_ppa32(struct pblk *pblk, struct ppa_addr ppa64)
 		ppa32 |= ppa64.c.line;
 		ppa32 |= ppa64.c.line;
 		ppa32 |= 1U << 31;
 		ppa32 |= 1U << 31;
 	} else {
 	} else {
-		ppa32 |= ppa64.g.blk << pblk->ppaf.blk_offset;
-		ppa32 |= ppa64.g.pg << pblk->ppaf.pg_offset;
-		ppa32 |= ppa64.g.lun << pblk->ppaf.lun_offset;
-		ppa32 |= ppa64.g.ch << pblk->ppaf.ch_offset;
-		ppa32 |= ppa64.g.pl << pblk->ppaf.pln_offset;
-		ppa32 |= ppa64.g.sec << pblk->ppaf.sec_offset;
+		struct nvm_tgt_dev *dev = pblk->dev;
+		struct nvm_geo *geo = &dev->geo;
+
+		if (geo->version == NVM_OCSSD_SPEC_12) {
+			struct nvm_addrf_12 *ppaf =
+					(struct nvm_addrf_12 *)&pblk->addrf;
+
+			ppa32 |= ppa64.g.ch << ppaf->ch_offset;
+			ppa32 |= ppa64.g.lun << ppaf->lun_offset;
+			ppa32 |= ppa64.g.blk << ppaf->blk_offset;
+			ppa32 |= ppa64.g.pg << ppaf->pg_offset;
+			ppa32 |= ppa64.g.pl << ppaf->pln_offset;
+			ppa32 |= ppa64.g.sec << ppaf->sec_offset;
+		} else {
+			struct nvm_addrf *lbaf = &pblk->addrf;
+
+			ppa32 |= ppa64.m.grp << lbaf->ch_offset;
+			ppa32 |= ppa64.m.pu << lbaf->lun_offset;
+			ppa32 |= ppa64.m.chk << lbaf->chk_offset;
+			ppa32 |= ppa64.m.sec << lbaf->sec_offset;
+		}
 	}
 	}
 
 
 	return ppa32;
 	return ppa32;
@@ -1008,7 +1124,7 @@ static inline struct ppa_addr pblk_trans_map_get(struct pblk *pblk,
 {
 {
 	struct ppa_addr ppa;
 	struct ppa_addr ppa;
 
 
-	if (pblk->ppaf_bitsize < 32) {
+	if (pblk->addrf_len < 32) {
 		u32 *map = (u32 *)pblk->trans_map;
 		u32 *map = (u32 *)pblk->trans_map;
 
 
 		ppa = pblk_ppa32_to_ppa64(pblk, map[lba]);
 		ppa = pblk_ppa32_to_ppa64(pblk, map[lba]);
@@ -1024,7 +1140,7 @@ static inline struct ppa_addr pblk_trans_map_get(struct pblk *pblk,
 static inline void pblk_trans_map_set(struct pblk *pblk, sector_t lba,
 static inline void pblk_trans_map_set(struct pblk *pblk, sector_t lba,
 						struct ppa_addr ppa)
 						struct ppa_addr ppa)
 {
 {
-	if (pblk->ppaf_bitsize < 32) {
+	if (pblk->addrf_len < 32) {
 		u32 *map = (u32 *)pblk->trans_map;
 		u32 *map = (u32 *)pblk->trans_map;
 
 
 		map[lba] = pblk_ppa64_to_ppa32(pblk, ppa);
 		map[lba] = pblk_ppa64_to_ppa32(pblk, ppa);
@@ -1115,7 +1231,10 @@ static inline int pblk_set_progr_mode(struct pblk *pblk, int type)
 	struct nvm_geo *geo = &dev->geo;
 	struct nvm_geo *geo = &dev->geo;
 	int flags;
 	int flags;
 
 
-	flags = geo->plane_mode >> 1;
+	if (geo->version == NVM_OCSSD_SPEC_20)
+		return 0;
+
+	flags = geo->pln_mode >> 1;
 
 
 	if (type == PBLK_WRITE)
 	if (type == PBLK_WRITE)
 		flags |= NVM_IO_SCRAMBLE_ENABLE;
 		flags |= NVM_IO_SCRAMBLE_ENABLE;
@@ -1134,9 +1253,12 @@ static inline int pblk_set_read_mode(struct pblk *pblk, int type)
 	struct nvm_geo *geo = &dev->geo;
 	struct nvm_geo *geo = &dev->geo;
 	int flags;
 	int flags;
 
 
+	if (geo->version == NVM_OCSSD_SPEC_20)
+		return 0;
+
 	flags = NVM_IO_SUSPEND | NVM_IO_SCRAMBLE_ENABLE;
 	flags = NVM_IO_SUSPEND | NVM_IO_SCRAMBLE_ENABLE;
 	if (type == PBLK_READ_SEQUENTIAL)
 	if (type == PBLK_READ_SEQUENTIAL)
-		flags |= geo->plane_mode >> 1;
+		flags |= geo->pln_mode >> 1;
 
 
 	return flags;
 	return flags;
 }
 }
@@ -1147,16 +1269,21 @@ static inline int pblk_io_aligned(struct pblk *pblk, int nr_secs)
 }
 }
 
 
 #ifdef CONFIG_NVM_DEBUG
 #ifdef CONFIG_NVM_DEBUG
-static inline void print_ppa(struct ppa_addr *p, char *msg, int error)
+static inline void print_ppa(struct nvm_geo *geo, struct ppa_addr *p,
+			     char *msg, int error)
 {
 {
 	if (p->c.is_cached) {
 	if (p->c.is_cached) {
 		pr_err("ppa: (%s: %x) cache line: %llu\n",
 		pr_err("ppa: (%s: %x) cache line: %llu\n",
 				msg, error, (u64)p->c.line);
 				msg, error, (u64)p->c.line);
-	} else {
+	} else if (geo->version == NVM_OCSSD_SPEC_12) {
 		pr_err("ppa: (%s: %x):ch:%d,lun:%d,blk:%d,pg:%d,pl:%d,sec:%d\n",
 		pr_err("ppa: (%s: %x):ch:%d,lun:%d,blk:%d,pg:%d,pl:%d,sec:%d\n",
 			msg, error,
 			msg, error,
 			p->g.ch, p->g.lun, p->g.blk,
 			p->g.ch, p->g.lun, p->g.blk,
 			p->g.pg, p->g.pl, p->g.sec);
 			p->g.pg, p->g.pl, p->g.sec);
+	} else {
+		pr_err("ppa: (%s: %x):ch:%d,lun:%d,chk:%d,sec:%d\n",
+			msg, error,
+			p->m.grp, p->m.pu, p->m.chk, p->m.sec);
 	}
 	}
 }
 }
 
 
@@ -1166,13 +1293,13 @@ static inline void pblk_print_failed_rqd(struct pblk *pblk, struct nvm_rq *rqd,
 	int bit = -1;
 	int bit = -1;
 
 
 	if (rqd->nr_ppas ==  1) {
 	if (rqd->nr_ppas ==  1) {
-		print_ppa(&rqd->ppa_addr, "rqd", error);
+		print_ppa(&pblk->dev->geo, &rqd->ppa_addr, "rqd", error);
 		return;
 		return;
 	}
 	}
 
 
 	while ((bit = find_next_bit((void *)&rqd->ppa_status, rqd->nr_ppas,
 	while ((bit = find_next_bit((void *)&rqd->ppa_status, rqd->nr_ppas,
 						bit + 1)) < rqd->nr_ppas) {
 						bit + 1)) < rqd->nr_ppas) {
-		print_ppa(&rqd->ppa_list[bit], "rqd", error);
+		print_ppa(&pblk->dev->geo, &rqd->ppa_list[bit], "rqd", error);
 	}
 	}
 
 
 	pr_err("error:%d, ppa_status:%llx\n", error, rqd->ppa_status);
 	pr_err("error:%d, ppa_status:%llx\n", error, rqd->ppa_status);
@@ -1188,16 +1315,25 @@ static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev,
 	for (i = 0; i < nr_ppas; i++) {
 	for (i = 0; i < nr_ppas; i++) {
 		ppa = &ppas[i];
 		ppa = &ppas[i];
 
 
-		if (!ppa->c.is_cached &&
-				ppa->g.ch < geo->nr_chnls &&
-				ppa->g.lun < geo->nr_luns &&
-				ppa->g.pl < geo->nr_planes &&
-				ppa->g.blk < geo->nr_chks &&
-				ppa->g.pg < geo->ws_per_chk &&
-				ppa->g.sec < geo->sec_per_pg)
-			continue;
+		if (geo->version == NVM_OCSSD_SPEC_12) {
+			if (!ppa->c.is_cached &&
+					ppa->g.ch < geo->num_ch &&
+					ppa->g.lun < geo->num_lun &&
+					ppa->g.pl < geo->num_pln &&
+					ppa->g.blk < geo->num_chk &&
+					ppa->g.pg < geo->num_pg &&
+					ppa->g.sec < geo->ws_min)
+				continue;
+		} else {
+			if (!ppa->c.is_cached &&
+					ppa->m.grp < geo->num_ch &&
+					ppa->m.pu < geo->num_lun &&
+					ppa->m.chk < geo->num_chk &&
+					ppa->m.sec < geo->clba)
+				continue;
+		}
 
 
-		print_ppa(ppa, "boundary", i);
+		print_ppa(geo, ppa, "boundary", i);
 
 
 		return 1;
 		return 1;
 	}
 	}

+ 2 - 1
drivers/md/bcache/alloc.c

@@ -287,7 +287,8 @@ do {									\
 			break;						\
 			break;						\
 									\
 									\
 		mutex_unlock(&(ca)->set->bucket_lock);			\
 		mutex_unlock(&(ca)->set->bucket_lock);			\
-		if (kthread_should_stop()) {				\
+		if (kthread_should_stop() ||				\
+		    test_bit(CACHE_SET_IO_DISABLE, &ca->set->flags)) {	\
 			set_current_state(TASK_RUNNING);		\
 			set_current_state(TASK_RUNNING);		\
 			return 0;					\
 			return 0;					\
 		}							\
 		}							\

+ 53 - 4
drivers/md/bcache/bcache.h

@@ -188,6 +188,7 @@
 #include <linux/refcount.h>
 #include <linux/refcount.h>
 #include <linux/types.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
 #include <linux/workqueue.h>
+#include <linux/kthread.h>
 
 
 #include "bset.h"
 #include "bset.h"
 #include "util.h"
 #include "util.h"
@@ -258,10 +259,11 @@ struct bcache_device {
 	struct gendisk		*disk;
 	struct gendisk		*disk;
 
 
 	unsigned long		flags;
 	unsigned long		flags;
-#define BCACHE_DEV_CLOSING	0
-#define BCACHE_DEV_DETACHING	1
-#define BCACHE_DEV_UNLINK_DONE	2
-
+#define BCACHE_DEV_CLOSING		0
+#define BCACHE_DEV_DETACHING		1
+#define BCACHE_DEV_UNLINK_DONE		2
+#define BCACHE_DEV_WB_RUNNING		3
+#define BCACHE_DEV_RATE_DW_RUNNING	4
 	unsigned		nr_stripes;
 	unsigned		nr_stripes;
 	unsigned		stripe_size;
 	unsigned		stripe_size;
 	atomic_t		*stripe_sectors_dirty;
 	atomic_t		*stripe_sectors_dirty;
@@ -286,6 +288,12 @@ struct io {
 	sector_t		last;
 	sector_t		last;
 };
 };
 
 
+enum stop_on_failure {
+	BCH_CACHED_DEV_STOP_AUTO = 0,
+	BCH_CACHED_DEV_STOP_ALWAYS,
+	BCH_CACHED_DEV_STOP_MODE_MAX,
+};
+
 struct cached_dev {
 struct cached_dev {
 	struct list_head	list;
 	struct list_head	list;
 	struct bcache_device	disk;
 	struct bcache_device	disk;
@@ -359,6 +367,7 @@ struct cached_dev {
 	unsigned		sequential_cutoff;
 	unsigned		sequential_cutoff;
 	unsigned		readahead;
 	unsigned		readahead;
 
 
+	unsigned		io_disable:1;
 	unsigned		verify:1;
 	unsigned		verify:1;
 	unsigned		bypass_torture_test:1;
 	unsigned		bypass_torture_test:1;
 
 
@@ -378,6 +387,11 @@ struct cached_dev {
 	unsigned		writeback_rate_i_term_inverse;
 	unsigned		writeback_rate_i_term_inverse;
 	unsigned		writeback_rate_p_term_inverse;
 	unsigned		writeback_rate_p_term_inverse;
 	unsigned		writeback_rate_minimum;
 	unsigned		writeback_rate_minimum;
+
+	enum stop_on_failure	stop_when_cache_set_failed;
+#define DEFAULT_CACHED_DEV_ERROR_LIMIT	64
+	atomic_t		io_errors;
+	unsigned		error_limit;
 };
 };
 
 
 enum alloc_reserve {
 enum alloc_reserve {
@@ -474,10 +488,15 @@ struct gc_stat {
  *
  *
  * CACHE_SET_RUNNING means all cache devices have been registered and journal
  * CACHE_SET_RUNNING means all cache devices have been registered and journal
  * replay is complete.
  * replay is complete.
+ *
+ * CACHE_SET_IO_DISABLE is set when bcache is stopping the whold cache set, all
+ * external and internal I/O should be denied when this flag is set.
+ *
  */
  */
 #define CACHE_SET_UNREGISTERING		0
 #define CACHE_SET_UNREGISTERING		0
 #define	CACHE_SET_STOPPING		1
 #define	CACHE_SET_STOPPING		1
 #define	CACHE_SET_RUNNING		2
 #define	CACHE_SET_RUNNING		2
+#define CACHE_SET_IO_DISABLE		3
 
 
 struct cache_set {
 struct cache_set {
 	struct closure		cl;
 	struct closure		cl;
@@ -867,8 +886,36 @@ static inline void wake_up_allocators(struct cache_set *c)
 		wake_up_process(ca->alloc_thread);
 		wake_up_process(ca->alloc_thread);
 }
 }
 
 
+static inline void closure_bio_submit(struct cache_set *c,
+				      struct bio *bio,
+				      struct closure *cl)
+{
+	closure_get(cl);
+	if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) {
+		bio->bi_status = BLK_STS_IOERR;
+		bio_endio(bio);
+		return;
+	}
+	generic_make_request(bio);
+}
+
+/*
+ * Prevent the kthread exits directly, and make sure when kthread_stop()
+ * is called to stop a kthread, it is still alive. If a kthread might be
+ * stopped by CACHE_SET_IO_DISABLE bit set, wait_for_kthread_stop() is
+ * necessary before the kthread returns.
+ */
+static inline void wait_for_kthread_stop(void)
+{
+	while (!kthread_should_stop()) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule();
+	}
+}
+
 /* Forward declarations */
 /* Forward declarations */
 
 
+void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio);
 void bch_count_io_errors(struct cache *, blk_status_t, int, const char *);
 void bch_count_io_errors(struct cache *, blk_status_t, int, const char *);
 void bch_bbio_count_io_errors(struct cache_set *, struct bio *,
 void bch_bbio_count_io_errors(struct cache_set *, struct bio *,
 			      blk_status_t, const char *);
 			      blk_status_t, const char *);
@@ -896,6 +943,7 @@ int bch_bucket_alloc_set(struct cache_set *, unsigned,
 			 struct bkey *, int, bool);
 			 struct bkey *, int, bool);
 bool bch_alloc_sectors(struct cache_set *, struct bkey *, unsigned,
 bool bch_alloc_sectors(struct cache_set *, struct bkey *, unsigned,
 		       unsigned, unsigned, bool);
 		       unsigned, unsigned, bool);
+bool bch_cached_dev_error(struct cached_dev *dc);
 
 
 __printf(2, 3)
 __printf(2, 3)
 bool bch_cache_set_error(struct cache_set *, const char *, ...);
 bool bch_cache_set_error(struct cache_set *, const char *, ...);
@@ -905,6 +953,7 @@ void bch_write_bdev_super(struct cached_dev *, struct closure *);
 
 
 extern struct workqueue_struct *bcache_wq;
 extern struct workqueue_struct *bcache_wq;
 extern const char * const bch_cache_modes[];
 extern const char * const bch_cache_modes[];
+extern const char * const bch_stop_on_failure_modes[];
 extern struct mutex bch_register_lock;
 extern struct mutex bch_register_lock;
 extern struct list_head bch_cache_sets;
 extern struct list_head bch_cache_sets;
 
 

+ 2 - 2
drivers/md/bcache/bset.c

@@ -1072,7 +1072,7 @@ EXPORT_SYMBOL(bch_btree_iter_init);
 static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter,
 static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter,
 						 btree_iter_cmp_fn *cmp)
 						 btree_iter_cmp_fn *cmp)
 {
 {
-	struct btree_iter_set unused;
+	struct btree_iter_set b __maybe_unused;
 	struct bkey *ret = NULL;
 	struct bkey *ret = NULL;
 
 
 	if (!btree_iter_end(iter)) {
 	if (!btree_iter_end(iter)) {
@@ -1087,7 +1087,7 @@ static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter,
 		}
 		}
 
 
 		if (iter->data->k == iter->data->end)
 		if (iter->data->k == iter->data->end)
-			heap_pop(iter, unused, cmp);
+			heap_pop(iter, b, cmp);
 		else
 		else
 			heap_sift(iter, 0, cmp);
 			heap_sift(iter, 0, cmp);
 	}
 	}

+ 3 - 2
drivers/md/bcache/bset.h

@@ -531,14 +531,15 @@ int __bch_keylist_realloc(struct keylist *, unsigned);
 #ifdef CONFIG_BCACHE_DEBUG
 #ifdef CONFIG_BCACHE_DEBUG
 
 
 int __bch_count_data(struct btree_keys *);
 int __bch_count_data(struct btree_keys *);
-void __bch_check_keys(struct btree_keys *, const char *, ...);
+void __printf(2, 3) __bch_check_keys(struct btree_keys *, const char *, ...);
 void bch_dump_bset(struct btree_keys *, struct bset *, unsigned);
 void bch_dump_bset(struct btree_keys *, struct bset *, unsigned);
 void bch_dump_bucket(struct btree_keys *);
 void bch_dump_bucket(struct btree_keys *);
 
 
 #else
 #else
 
 
 static inline int __bch_count_data(struct btree_keys *b) { return -1; }
 static inline int __bch_count_data(struct btree_keys *b) { return -1; }
-static inline void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) {}
+static inline void __printf(2, 3)
+	__bch_check_keys(struct btree_keys *b, const char *fmt, ...) {}
 static inline void bch_dump_bucket(struct btree_keys *b) {}
 static inline void bch_dump_bucket(struct btree_keys *b) {}
 void bch_dump_bset(struct btree_keys *, struct bset *, unsigned);
 void bch_dump_bset(struct btree_keys *, struct bset *, unsigned);
 
 

+ 17 - 9
drivers/md/bcache/btree.c

@@ -665,6 +665,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
 	struct btree *b, *t;
 	struct btree *b, *t;
 	unsigned long i, nr = sc->nr_to_scan;
 	unsigned long i, nr = sc->nr_to_scan;
 	unsigned long freed = 0;
 	unsigned long freed = 0;
+	unsigned int btree_cache_used;
 
 
 	if (c->shrinker_disabled)
 	if (c->shrinker_disabled)
 		return SHRINK_STOP;
 		return SHRINK_STOP;
@@ -689,9 +690,10 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
 	nr = min_t(unsigned long, nr, mca_can_free(c));
 	nr = min_t(unsigned long, nr, mca_can_free(c));
 
 
 	i = 0;
 	i = 0;
+	btree_cache_used = c->btree_cache_used;
 	list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
 	list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
-		if (freed >= nr)
-			break;
+		if (nr <= 0)
+			goto out;
 
 
 		if (++i > 3 &&
 		if (++i > 3 &&
 		    !mca_reap(b, 0, false)) {
 		    !mca_reap(b, 0, false)) {
@@ -699,9 +701,10 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
 			rw_unlock(true, b);
 			rw_unlock(true, b);
 			freed++;
 			freed++;
 		}
 		}
+		nr--;
 	}
 	}
 
 
-	for (i = 0; (nr--) && i < c->btree_cache_used; i++) {
+	for (;  (nr--) && i < btree_cache_used; i++) {
 		if (list_empty(&c->btree_cache))
 		if (list_empty(&c->btree_cache))
 			goto out;
 			goto out;
 
 
@@ -719,7 +722,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
 	}
 	}
 out:
 out:
 	mutex_unlock(&c->bucket_lock);
 	mutex_unlock(&c->bucket_lock);
-	return freed;
+	return freed * c->btree_pages;
 }
 }
 
 
 static unsigned long bch_mca_count(struct shrinker *shrink,
 static unsigned long bch_mca_count(struct shrinker *shrink,
@@ -959,7 +962,7 @@ err:
 	return b;
 	return b;
 }
 }
 
 
-/**
+/*
  * bch_btree_node_get - find a btree node in the cache and lock it, reading it
  * bch_btree_node_get - find a btree node in the cache and lock it, reading it
  * in from disk if necessary.
  * in from disk if necessary.
  *
  *
@@ -1744,6 +1747,7 @@ static void bch_btree_gc(struct cache_set *c)
 
 
 	btree_gc_start(c);
 	btree_gc_start(c);
 
 
+	/* if CACHE_SET_IO_DISABLE set, gc thread should stop too */
 	do {
 	do {
 		ret = btree_root(gc_root, c, &op, &writes, &stats);
 		ret = btree_root(gc_root, c, &op, &writes, &stats);
 		closure_sync(&writes);
 		closure_sync(&writes);
@@ -1751,7 +1755,7 @@ static void bch_btree_gc(struct cache_set *c)
 
 
 		if (ret && ret != -EAGAIN)
 		if (ret && ret != -EAGAIN)
 			pr_warn("gc failed!");
 			pr_warn("gc failed!");
-	} while (ret);
+	} while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));
 
 
 	bch_btree_gc_finish(c);
 	bch_btree_gc_finish(c);
 	wake_up_allocators(c);
 	wake_up_allocators(c);
@@ -1789,15 +1793,19 @@ static int bch_gc_thread(void *arg)
 
 
 	while (1) {
 	while (1) {
 		wait_event_interruptible(c->gc_wait,
 		wait_event_interruptible(c->gc_wait,
-			   kthread_should_stop() || gc_should_run(c));
+			   kthread_should_stop() ||
+			   test_bit(CACHE_SET_IO_DISABLE, &c->flags) ||
+			   gc_should_run(c));
 
 
-		if (kthread_should_stop())
+		if (kthread_should_stop() ||
+		    test_bit(CACHE_SET_IO_DISABLE, &c->flags))
 			break;
 			break;
 
 
 		set_gc_sectors(c);
 		set_gc_sectors(c);
 		bch_btree_gc(c);
 		bch_btree_gc(c);
 	}
 	}
 
 
+	wait_for_kthread_stop();
 	return 0;
 	return 0;
 }
 }
 
 
@@ -2170,7 +2178,7 @@ int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
 
 
 		if (b->key.ptr[0] != btree_ptr ||
 		if (b->key.ptr[0] != btree_ptr ||
                    b->seq != seq + 1) {
                    b->seq != seq + 1) {
-                       op->lock = b->level;
+			op->lock = b->level;
 			goto out;
 			goto out;
                }
                }
 	}
 	}

+ 9 - 8
drivers/md/bcache/closure.c

@@ -46,7 +46,7 @@ void closure_sub(struct closure *cl, int v)
 }
 }
 EXPORT_SYMBOL(closure_sub);
 EXPORT_SYMBOL(closure_sub);
 
 
-/**
+/*
  * closure_put - decrement a closure's refcount
  * closure_put - decrement a closure's refcount
  */
  */
 void closure_put(struct closure *cl)
 void closure_put(struct closure *cl)
@@ -55,7 +55,7 @@ void closure_put(struct closure *cl)
 }
 }
 EXPORT_SYMBOL(closure_put);
 EXPORT_SYMBOL(closure_put);
 
 
-/**
+/*
  * closure_wake_up - wake up all closures on a wait list, without memory barrier
  * closure_wake_up - wake up all closures on a wait list, without memory barrier
  */
  */
 void __closure_wake_up(struct closure_waitlist *wait_list)
 void __closure_wake_up(struct closure_waitlist *wait_list)
@@ -79,9 +79,9 @@ EXPORT_SYMBOL(__closure_wake_up);
 
 
 /**
 /**
  * closure_wait - add a closure to a waitlist
  * closure_wait - add a closure to a waitlist
- *
- * @waitlist will own a ref on @cl, which will be released when
+ * @waitlist: will own a ref on @cl, which will be released when
  * closure_wake_up() is called on @waitlist.
  * closure_wake_up() is called on @waitlist.
+ * @cl: closure pointer.
  *
  *
  */
  */
 bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
 bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
@@ -157,7 +157,7 @@ void closure_debug_destroy(struct closure *cl)
 }
 }
 EXPORT_SYMBOL(closure_debug_destroy);
 EXPORT_SYMBOL(closure_debug_destroy);
 
 
-static struct dentry *debug;
+static struct dentry *closure_debug;
 
 
 static int debug_seq_show(struct seq_file *f, void *data)
 static int debug_seq_show(struct seq_file *f, void *data)
 {
 {
@@ -199,11 +199,12 @@ static const struct file_operations debug_ops = {
 	.release	= single_release
 	.release	= single_release
 };
 };
 
 
-void __init closure_debug_init(void)
+int __init closure_debug_init(void)
 {
 {
-	debug = debugfs_create_file("closures", 0400, NULL, NULL, &debug_ops);
+	closure_debug = debugfs_create_file("closures",
+				0400, bcache_debug, NULL, &debug_ops);
+	return IS_ERR_OR_NULL(closure_debug);
 }
 }
-
 #endif
 #endif
 
 
 MODULE_AUTHOR("Kent Overstreet <koverstreet@google.com>");
 MODULE_AUTHOR("Kent Overstreet <koverstreet@google.com>");

+ 3 - 2
drivers/md/bcache/closure.h

@@ -105,6 +105,7 @@
 struct closure;
 struct closure;
 struct closure_syncer;
 struct closure_syncer;
 typedef void (closure_fn) (struct closure *);
 typedef void (closure_fn) (struct closure *);
+extern struct dentry *bcache_debug;
 
 
 struct closure_waitlist {
 struct closure_waitlist {
 	struct llist_head	list;
 	struct llist_head	list;
@@ -185,13 +186,13 @@ static inline void closure_sync(struct closure *cl)
 
 
 #ifdef CONFIG_BCACHE_CLOSURES_DEBUG
 #ifdef CONFIG_BCACHE_CLOSURES_DEBUG
 
 
-void closure_debug_init(void);
+int closure_debug_init(void);
 void closure_debug_create(struct closure *cl);
 void closure_debug_create(struct closure *cl);
 void closure_debug_destroy(struct closure *cl);
 void closure_debug_destroy(struct closure *cl);
 
 
 #else
 #else
 
 
-static inline void closure_debug_init(void) {}
+static inline int closure_debug_init(void) { return 0; }
 static inline void closure_debug_create(struct closure *cl) {}
 static inline void closure_debug_create(struct closure *cl) {}
 static inline void closure_debug_destroy(struct closure *cl) {}
 static inline void closure_debug_destroy(struct closure *cl) {}
 
 

+ 7 - 7
drivers/md/bcache/debug.c

@@ -17,7 +17,7 @@
 #include <linux/random.h>
 #include <linux/random.h>
 #include <linux/seq_file.h>
 #include <linux/seq_file.h>
 
 
-static struct dentry *debug;
+struct dentry *bcache_debug;
 
 
 #ifdef CONFIG_BCACHE_DEBUG
 #ifdef CONFIG_BCACHE_DEBUG
 
 
@@ -232,11 +232,11 @@ static const struct file_operations cache_set_debug_ops = {
 
 
 void bch_debug_init_cache_set(struct cache_set *c)
 void bch_debug_init_cache_set(struct cache_set *c)
 {
 {
-	if (!IS_ERR_OR_NULL(debug)) {
+	if (!IS_ERR_OR_NULL(bcache_debug)) {
 		char name[50];
 		char name[50];
 		snprintf(name, 50, "bcache-%pU", c->sb.set_uuid);
 		snprintf(name, 50, "bcache-%pU", c->sb.set_uuid);
 
 
-		c->debug = debugfs_create_file(name, 0400, debug, c,
+		c->debug = debugfs_create_file(name, 0400, bcache_debug, c,
 					       &cache_set_debug_ops);
 					       &cache_set_debug_ops);
 	}
 	}
 }
 }
@@ -245,13 +245,13 @@ void bch_debug_init_cache_set(struct cache_set *c)
 
 
 void bch_debug_exit(void)
 void bch_debug_exit(void)
 {
 {
-	if (!IS_ERR_OR_NULL(debug))
-		debugfs_remove_recursive(debug);
+	if (!IS_ERR_OR_NULL(bcache_debug))
+		debugfs_remove_recursive(bcache_debug);
 }
 }
 
 
 int __init bch_debug_init(struct kobject *kobj)
 int __init bch_debug_init(struct kobject *kobj)
 {
 {
-	debug = debugfs_create_dir("bcache", NULL);
+	bcache_debug = debugfs_create_dir("bcache", NULL);
 
 
-	return IS_ERR_OR_NULL(debug);
+	return IS_ERR_OR_NULL(bcache_debug);
 }
 }

+ 0 - 2
drivers/md/bcache/extents.c

@@ -534,7 +534,6 @@ err:
 static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k)
 static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k)
 {
 {
 	struct btree *b = container_of(bk, struct btree, keys);
 	struct btree *b = container_of(bk, struct btree, keys);
-	struct bucket *g;
 	unsigned i, stale;
 	unsigned i, stale;
 
 
 	if (!KEY_PTRS(k) ||
 	if (!KEY_PTRS(k) ||
@@ -549,7 +548,6 @@ static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k)
 		return false;
 		return false;
 
 
 	for (i = 0; i < KEY_PTRS(k); i++) {
 	for (i = 0; i < KEY_PTRS(k); i++) {
-		g = PTR_BUCKET(b->c, k, i);
 		stale = ptr_stale(b->c, k, i);
 		stale = ptr_stale(b->c, k, i);
 
 
 		btree_bug_on(stale > 96, b,
 		btree_bug_on(stale > 96, b,

+ 15 - 1
drivers/md/bcache/io.c

@@ -38,7 +38,7 @@ void __bch_submit_bbio(struct bio *bio, struct cache_set *c)
 	bio_set_dev(bio, PTR_CACHE(c, &b->key, 0)->bdev);
 	bio_set_dev(bio, PTR_CACHE(c, &b->key, 0)->bdev);
 
 
 	b->submit_time_us = local_clock_us();
 	b->submit_time_us = local_clock_us();
-	closure_bio_submit(bio, bio->bi_private);
+	closure_bio_submit(c, bio, bio->bi_private);
 }
 }
 
 
 void bch_submit_bbio(struct bio *bio, struct cache_set *c,
 void bch_submit_bbio(struct bio *bio, struct cache_set *c,
@@ -50,6 +50,20 @@ void bch_submit_bbio(struct bio *bio, struct cache_set *c,
 }
 }
 
 
 /* IO errors */
 /* IO errors */
+void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio)
+{
+	char buf[BDEVNAME_SIZE];
+	unsigned errors;
+
+	WARN_ONCE(!dc, "NULL pointer of struct cached_dev");
+
+	errors = atomic_add_return(1, &dc->io_errors);
+	if (errors < dc->error_limit)
+		pr_err("%s: IO error on backing device, unrecoverable",
+			bio_devname(bio, buf));
+	else
+		bch_cached_dev_error(dc);
+}
 
 
 void bch_count_io_errors(struct cache *ca,
 void bch_count_io_errors(struct cache *ca,
 			 blk_status_t error,
 			 blk_status_t error,

+ 5 - 3
drivers/md/bcache/journal.c

@@ -62,7 +62,7 @@ reread:		left = ca->sb.bucket_size - offset;
 		bio_set_op_attrs(bio, REQ_OP_READ, 0);
 		bio_set_op_attrs(bio, REQ_OP_READ, 0);
 		bch_bio_map(bio, data);
 		bch_bio_map(bio, data);
 
 
-		closure_bio_submit(bio, &cl);
+		closure_bio_submit(ca->set, bio, &cl);
 		closure_sync(&cl);
 		closure_sync(&cl);
 
 
 		/* This function could be simpler now since we no longer write
 		/* This function could be simpler now since we no longer write
@@ -493,7 +493,7 @@ static void journal_reclaim(struct cache_set *c)
 	struct cache *ca;
 	struct cache *ca;
 	uint64_t last_seq;
 	uint64_t last_seq;
 	unsigned iter, n = 0;
 	unsigned iter, n = 0;
-	atomic_t p;
+	atomic_t p __maybe_unused;
 
 
 	atomic_long_inc(&c->reclaim);
 	atomic_long_inc(&c->reclaim);
 
 
@@ -594,6 +594,7 @@ static void journal_write_done(struct closure *cl)
 }
 }
 
 
 static void journal_write_unlock(struct closure *cl)
 static void journal_write_unlock(struct closure *cl)
+	__releases(&c->journal.lock)
 {
 {
 	struct cache_set *c = container_of(cl, struct cache_set, journal.io);
 	struct cache_set *c = container_of(cl, struct cache_set, journal.io);
 
 
@@ -674,7 +675,7 @@ static void journal_write_unlocked(struct closure *cl)
 	spin_unlock(&c->journal.lock);
 	spin_unlock(&c->journal.lock);
 
 
 	while ((bio = bio_list_pop(&list)))
 	while ((bio = bio_list_pop(&list)))
-		closure_bio_submit(bio, cl);
+		closure_bio_submit(c, bio, cl);
 
 
 	continue_at(cl, journal_write_done, NULL);
 	continue_at(cl, journal_write_done, NULL);
 }
 }
@@ -705,6 +706,7 @@ static void journal_try_write(struct cache_set *c)
 
 
 static struct journal_write *journal_wait_for_write(struct cache_set *c,
 static struct journal_write *journal_wait_for_write(struct cache_set *c,
 						    unsigned nkeys)
 						    unsigned nkeys)
+	__acquires(&c->journal.lock)
 {
 {
 	size_t sectors;
 	size_t sectors;
 	struct closure cl;
 	struct closure cl;

+ 157 - 29
drivers/md/bcache/request.c

@@ -139,6 +139,7 @@ static void bch_data_invalidate(struct closure *cl)
 	}
 	}
 
 
 	op->insert_data_done = true;
 	op->insert_data_done = true;
+	/* get in bch_data_insert() */
 	bio_put(bio);
 	bio_put(bio);
 out:
 out:
 	continue_at(cl, bch_data_insert_keys, op->wq);
 	continue_at(cl, bch_data_insert_keys, op->wq);
@@ -295,6 +296,7 @@ err:
 
 
 /**
 /**
  * bch_data_insert - stick some data in the cache
  * bch_data_insert - stick some data in the cache
+ * @cl: closure pointer.
  *
  *
  * This is the starting point for any data to end up in a cache device; it could
  * This is the starting point for any data to end up in a cache device; it could
  * be from a normal write, or a writeback write, or a write to a flash only
  * be from a normal write, or a writeback write, or a write to a flash only
@@ -630,6 +632,41 @@ static void request_endio(struct bio *bio)
 	closure_put(cl);
 	closure_put(cl);
 }
 }
 
 
+static void backing_request_endio(struct bio *bio)
+{
+	struct closure *cl = bio->bi_private;
+
+	if (bio->bi_status) {
+		struct search *s = container_of(cl, struct search, cl);
+		struct cached_dev *dc = container_of(s->d,
+						     struct cached_dev, disk);
+		/*
+		 * If a bio has REQ_PREFLUSH for writeback mode, it is
+		 * speically assembled in cached_dev_write() for a non-zero
+		 * write request which has REQ_PREFLUSH. we don't set
+		 * s->iop.status by this failure, the status will be decided
+		 * by result of bch_data_insert() operation.
+		 */
+		if (unlikely(s->iop.writeback &&
+			     bio->bi_opf & REQ_PREFLUSH)) {
+			char buf[BDEVNAME_SIZE];
+
+			bio_devname(bio, buf);
+			pr_err("Can't flush %s: returned bi_status %i",
+				buf, bio->bi_status);
+		} else {
+			/* set to orig_bio->bi_status in bio_complete() */
+			s->iop.status = bio->bi_status;
+		}
+		s->recoverable = false;
+		/* should count I/O error for backing device here */
+		bch_count_backing_io_errors(dc, bio);
+	}
+
+	bio_put(bio);
+	closure_put(cl);
+}
+
 static void bio_complete(struct search *s)
 static void bio_complete(struct search *s)
 {
 {
 	if (s->orig_bio) {
 	if (s->orig_bio) {
@@ -644,13 +681,21 @@ static void bio_complete(struct search *s)
 	}
 	}
 }
 }
 
 
-static void do_bio_hook(struct search *s, struct bio *orig_bio)
+static void do_bio_hook(struct search *s,
+			struct bio *orig_bio,
+			bio_end_io_t *end_io_fn)
 {
 {
 	struct bio *bio = &s->bio.bio;
 	struct bio *bio = &s->bio.bio;
 
 
 	bio_init(bio, NULL, 0);
 	bio_init(bio, NULL, 0);
 	__bio_clone_fast(bio, orig_bio);
 	__bio_clone_fast(bio, orig_bio);
-	bio->bi_end_io		= request_endio;
+	/*
+	 * bi_end_io can be set separately somewhere else, e.g. the
+	 * variants in,
+	 * - cache_bio->bi_end_io from cached_dev_cache_miss()
+	 * - n->bi_end_io from cache_lookup_fn()
+	 */
+	bio->bi_end_io		= end_io_fn;
 	bio->bi_private		= &s->cl;
 	bio->bi_private		= &s->cl;
 
 
 	bio_cnt_set(bio, 3);
 	bio_cnt_set(bio, 3);
@@ -676,7 +721,7 @@ static inline struct search *search_alloc(struct bio *bio,
 	s = mempool_alloc(d->c->search, GFP_NOIO);
 	s = mempool_alloc(d->c->search, GFP_NOIO);
 
 
 	closure_init(&s->cl, NULL);
 	closure_init(&s->cl, NULL);
-	do_bio_hook(s, bio);
+	do_bio_hook(s, bio, request_endio);
 
 
 	s->orig_bio		= bio;
 	s->orig_bio		= bio;
 	s->cache_miss		= NULL;
 	s->cache_miss		= NULL;
@@ -743,11 +788,12 @@ static void cached_dev_read_error(struct closure *cl)
 		trace_bcache_read_retry(s->orig_bio);
 		trace_bcache_read_retry(s->orig_bio);
 
 
 		s->iop.status = 0;
 		s->iop.status = 0;
-		do_bio_hook(s, s->orig_bio);
+		do_bio_hook(s, s->orig_bio, backing_request_endio);
 
 
 		/* XXX: invalidate cache */
 		/* XXX: invalidate cache */
 
 
-		closure_bio_submit(bio, cl);
+		/* I/O request sent to backing device */
+		closure_bio_submit(s->iop.c, bio, cl);
 	}
 	}
 
 
 	continue_at(cl, cached_dev_cache_miss_done, NULL);
 	continue_at(cl, cached_dev_cache_miss_done, NULL);
@@ -859,7 +905,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
 	bio_copy_dev(cache_bio, miss);
 	bio_copy_dev(cache_bio, miss);
 	cache_bio->bi_iter.bi_size	= s->insert_bio_sectors << 9;
 	cache_bio->bi_iter.bi_size	= s->insert_bio_sectors << 9;
 
 
-	cache_bio->bi_end_io	= request_endio;
+	cache_bio->bi_end_io	= backing_request_endio;
 	cache_bio->bi_private	= &s->cl;
 	cache_bio->bi_private	= &s->cl;
 
 
 	bch_bio_map(cache_bio, NULL);
 	bch_bio_map(cache_bio, NULL);
@@ -872,15 +918,17 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
 	s->cache_miss	= miss;
 	s->cache_miss	= miss;
 	s->iop.bio	= cache_bio;
 	s->iop.bio	= cache_bio;
 	bio_get(cache_bio);
 	bio_get(cache_bio);
-	closure_bio_submit(cache_bio, &s->cl);
+	/* I/O request sent to backing device */
+	closure_bio_submit(s->iop.c, cache_bio, &s->cl);
 
 
 	return ret;
 	return ret;
 out_put:
 out_put:
 	bio_put(cache_bio);
 	bio_put(cache_bio);
 out_submit:
 out_submit:
-	miss->bi_end_io		= request_endio;
+	miss->bi_end_io		= backing_request_endio;
 	miss->bi_private	= &s->cl;
 	miss->bi_private	= &s->cl;
-	closure_bio_submit(miss, &s->cl);
+	/* I/O request sent to backing device */
+	closure_bio_submit(s->iop.c, miss, &s->cl);
 	return ret;
 	return ret;
 }
 }
 
 
@@ -943,31 +991,46 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
 		s->iop.bio = s->orig_bio;
 		s->iop.bio = s->orig_bio;
 		bio_get(s->iop.bio);
 		bio_get(s->iop.bio);
 
 
-		if ((bio_op(bio) != REQ_OP_DISCARD) ||
-		    blk_queue_discard(bdev_get_queue(dc->bdev)))
-			closure_bio_submit(bio, cl);
+		if (bio_op(bio) == REQ_OP_DISCARD &&
+		    !blk_queue_discard(bdev_get_queue(dc->bdev)))
+			goto insert_data;
+
+		/* I/O request sent to backing device */
+		bio->bi_end_io = backing_request_endio;
+		closure_bio_submit(s->iop.c, bio, cl);
+
 	} else if (s->iop.writeback) {
 	} else if (s->iop.writeback) {
 		bch_writeback_add(dc);
 		bch_writeback_add(dc);
 		s->iop.bio = bio;
 		s->iop.bio = bio;
 
 
 		if (bio->bi_opf & REQ_PREFLUSH) {
 		if (bio->bi_opf & REQ_PREFLUSH) {
-			/* Also need to send a flush to the backing device */
-			struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0,
-							     dc->disk.bio_split);
-
+			/*
+			 * Also need to send a flush to the backing
+			 * device.
+			 */
+			struct bio *flush;
+
+			flush = bio_alloc_bioset(GFP_NOIO, 0,
+						 dc->disk.bio_split);
+			if (!flush) {
+				s->iop.status = BLK_STS_RESOURCE;
+				goto insert_data;
+			}
 			bio_copy_dev(flush, bio);
 			bio_copy_dev(flush, bio);
-			flush->bi_end_io = request_endio;
+			flush->bi_end_io = backing_request_endio;
 			flush->bi_private = cl;
 			flush->bi_private = cl;
 			flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
 			flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
-
-			closure_bio_submit(flush, cl);
+			/* I/O request sent to backing device */
+			closure_bio_submit(s->iop.c, flush, cl);
 		}
 		}
 	} else {
 	} else {
 		s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split);
 		s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split);
-
-		closure_bio_submit(bio, cl);
+		/* I/O request sent to backing device */
+		bio->bi_end_io = backing_request_endio;
+		closure_bio_submit(s->iop.c, bio, cl);
 	}
 	}
 
 
+insert_data:
 	closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
 	closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
 	continue_at(cl, cached_dev_write_complete, NULL);
 	continue_at(cl, cached_dev_write_complete, NULL);
 }
 }
@@ -981,11 +1044,67 @@ static void cached_dev_nodata(struct closure *cl)
 		bch_journal_meta(s->iop.c, cl);
 		bch_journal_meta(s->iop.c, cl);
 
 
 	/* If it's a flush, we send the flush to the backing device too */
 	/* If it's a flush, we send the flush to the backing device too */
-	closure_bio_submit(bio, cl);
+	bio->bi_end_io = backing_request_endio;
+	closure_bio_submit(s->iop.c, bio, cl);
 
 
 	continue_at(cl, cached_dev_bio_complete, NULL);
 	continue_at(cl, cached_dev_bio_complete, NULL);
 }
 }
 
 
+struct detached_dev_io_private {
+	struct bcache_device	*d;
+	unsigned long		start_time;
+	bio_end_io_t		*bi_end_io;
+	void			*bi_private;
+};
+
+static void detached_dev_end_io(struct bio *bio)
+{
+	struct detached_dev_io_private *ddip;
+
+	ddip = bio->bi_private;
+	bio->bi_end_io = ddip->bi_end_io;
+	bio->bi_private = ddip->bi_private;
+
+	generic_end_io_acct(ddip->d->disk->queue,
+			    bio_data_dir(bio),
+			    &ddip->d->disk->part0, ddip->start_time);
+
+	if (bio->bi_status) {
+		struct cached_dev *dc = container_of(ddip->d,
+						     struct cached_dev, disk);
+		/* should count I/O error for backing device here */
+		bch_count_backing_io_errors(dc, bio);
+	}
+
+	kfree(ddip);
+	bio->bi_end_io(bio);
+}
+
+static void detached_dev_do_request(struct bcache_device *d, struct bio *bio)
+{
+	struct detached_dev_io_private *ddip;
+	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
+
+	/*
+	 * no need to call closure_get(&dc->disk.cl),
+	 * because upper layer had already opened bcache device,
+	 * which would call closure_get(&dc->disk.cl)
+	 */
+	ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO);
+	ddip->d = d;
+	ddip->start_time = jiffies;
+	ddip->bi_end_io = bio->bi_end_io;
+	ddip->bi_private = bio->bi_private;
+	bio->bi_end_io = detached_dev_end_io;
+	bio->bi_private = ddip;
+
+	if ((bio_op(bio) == REQ_OP_DISCARD) &&
+	    !blk_queue_discard(bdev_get_queue(dc->bdev)))
+		bio->bi_end_io(bio);
+	else
+		generic_make_request(bio);
+}
+
 /* Cached devices - read & write stuff */
 /* Cached devices - read & write stuff */
 
 
 static blk_qc_t cached_dev_make_request(struct request_queue *q,
 static blk_qc_t cached_dev_make_request(struct request_queue *q,
@@ -996,6 +1115,13 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
 	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
 	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
 	int rw = bio_data_dir(bio);
 	int rw = bio_data_dir(bio);
 
 
+	if (unlikely((d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags)) ||
+		     dc->io_disable)) {
+		bio->bi_status = BLK_STS_IOERR;
+		bio_endio(bio);
+		return BLK_QC_T_NONE;
+	}
+
 	atomic_set(&dc->backing_idle, 0);
 	atomic_set(&dc->backing_idle, 0);
 	generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
 	generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
 
 
@@ -1022,13 +1148,9 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
 			else
 			else
 				cached_dev_read(dc, s);
 				cached_dev_read(dc, s);
 		}
 		}
-	} else {
-		if ((bio_op(bio) == REQ_OP_DISCARD) &&
-		    !blk_queue_discard(bdev_get_queue(dc->bdev)))
-			bio_endio(bio);
-		else
-			generic_make_request(bio);
-	}
+	} else
+		/* I/O request sent to backing device */
+		detached_dev_do_request(d, bio);
 
 
 	return BLK_QC_T_NONE;
 	return BLK_QC_T_NONE;
 }
 }
@@ -1112,6 +1234,12 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q,
 	struct bcache_device *d = bio->bi_disk->private_data;
 	struct bcache_device *d = bio->bi_disk->private_data;
 	int rw = bio_data_dir(bio);
 	int rw = bio_data_dir(bio);
 
 
+	if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) {
+		bio->bi_status = BLK_STS_IOERR;
+		bio_endio(bio);
+		return BLK_QC_T_NONE;
+	}
+
 	generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
 	generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
 
 
 	s = search_alloc(bio, d);
 	s = search_alloc(bio, d);

+ 136 - 24
drivers/md/bcache/super.c

@@ -47,6 +47,14 @@ const char * const bch_cache_modes[] = {
 	NULL
 	NULL
 };
 };
 
 
+/* Default is -1; we skip past it for stop_when_cache_set_failed */
+const char * const bch_stop_on_failure_modes[] = {
+	"default",
+	"auto",
+	"always",
+	NULL
+};
+
 static struct kobject *bcache_kobj;
 static struct kobject *bcache_kobj;
 struct mutex bch_register_lock;
 struct mutex bch_register_lock;
 LIST_HEAD(bch_cache_sets);
 LIST_HEAD(bch_cache_sets);
@@ -265,6 +273,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
 	bio->bi_private = dc;
 	bio->bi_private = dc;
 
 
 	closure_get(cl);
 	closure_get(cl);
+	/* I/O request sent to backing device */
 	__write_super(&dc->sb, bio);
 	__write_super(&dc->sb, bio);
 
 
 	closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
 	closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
@@ -521,7 +530,7 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op,
 	bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
 	bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
 	bch_bio_map(bio, ca->disk_buckets);
 	bch_bio_map(bio, ca->disk_buckets);
 
 
-	closure_bio_submit(bio, &ca->prio);
+	closure_bio_submit(ca->set, bio, &ca->prio);
 	closure_sync(cl);
 	closure_sync(cl);
 }
 }
 
 
@@ -769,6 +778,8 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
 			      sector_t sectors)
 			      sector_t sectors)
 {
 {
 	struct request_queue *q;
 	struct request_queue *q;
+	const size_t max_stripes = min_t(size_t, INT_MAX,
+					 SIZE_MAX / sizeof(atomic_t));
 	size_t n;
 	size_t n;
 	int idx;
 	int idx;
 
 
@@ -777,9 +788,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
 
 
 	d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
 	d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
 
 
-	if (!d->nr_stripes ||
-	    d->nr_stripes > INT_MAX ||
-	    d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) {
+	if (!d->nr_stripes || d->nr_stripes > max_stripes) {
 		pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)",
 		pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)",
 			(unsigned)d->nr_stripes);
 			(unsigned)d->nr_stripes);
 		return -ENOMEM;
 		return -ENOMEM;
@@ -833,9 +842,9 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
 	q->limits.io_min		= block_size;
 	q->limits.io_min		= block_size;
 	q->limits.logical_block_size	= block_size;
 	q->limits.logical_block_size	= block_size;
 	q->limits.physical_block_size	= block_size;
 	q->limits.physical_block_size	= block_size;
-	set_bit(QUEUE_FLAG_NONROT,	&d->disk->queue->queue_flags);
-	clear_bit(QUEUE_FLAG_ADD_RANDOM, &d->disk->queue->queue_flags);
-	set_bit(QUEUE_FLAG_DISCARD,	&d->disk->queue->queue_flags);
+	blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue);
+	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, d->disk->queue);
+	blk_queue_flag_set(QUEUE_FLAG_DISCARD, d->disk->queue);
 
 
 	blk_queue_write_cache(q, true, true);
 	blk_queue_write_cache(q, true, true);
 
 
@@ -899,6 +908,31 @@ void bch_cached_dev_run(struct cached_dev *dc)
 		pr_debug("error creating sysfs link");
 		pr_debug("error creating sysfs link");
 }
 }
 
 
+/*
+ * If BCACHE_DEV_RATE_DW_RUNNING is set, it means routine of the delayed
+ * work dc->writeback_rate_update is running. Wait until the routine
+ * quits (BCACHE_DEV_RATE_DW_RUNNING is clear), then continue to
+ * cancel it. If BCACHE_DEV_RATE_DW_RUNNING is not clear after time_out
+ * seconds, give up waiting here and continue to cancel it too.
+ */
+static void cancel_writeback_rate_update_dwork(struct cached_dev *dc)
+{
+	int time_out = WRITEBACK_RATE_UPDATE_SECS_MAX * HZ;
+
+	do {
+		if (!test_bit(BCACHE_DEV_RATE_DW_RUNNING,
+			      &dc->disk.flags))
+			break;
+		time_out--;
+		schedule_timeout_interruptible(1);
+	} while (time_out > 0);
+
+	if (time_out == 0)
+		pr_warn("give up waiting for dc->writeback_write_update to quit");
+
+	cancel_delayed_work_sync(&dc->writeback_rate_update);
+}
+
 static void cached_dev_detach_finish(struct work_struct *w)
 static void cached_dev_detach_finish(struct work_struct *w)
 {
 {
 	struct cached_dev *dc = container_of(w, struct cached_dev, detach);
 	struct cached_dev *dc = container_of(w, struct cached_dev, detach);
@@ -911,7 +945,9 @@ static void cached_dev_detach_finish(struct work_struct *w)
 
 
 	mutex_lock(&bch_register_lock);
 	mutex_lock(&bch_register_lock);
 
 
-	cancel_delayed_work_sync(&dc->writeback_rate_update);
+	if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
+		cancel_writeback_rate_update_dwork(dc);
+
 	if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
 	if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
 		kthread_stop(dc->writeback_thread);
 		kthread_stop(dc->writeback_thread);
 		dc->writeback_thread = NULL;
 		dc->writeback_thread = NULL;
@@ -954,6 +990,7 @@ void bch_cached_dev_detach(struct cached_dev *dc)
 	closure_get(&dc->disk.cl);
 	closure_get(&dc->disk.cl);
 
 
 	bch_writeback_queue(dc);
 	bch_writeback_queue(dc);
+
 	cached_dev_put(dc);
 	cached_dev_put(dc);
 }
 }
 
 
@@ -1065,7 +1102,6 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
 	if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
 	if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
 		bch_sectors_dirty_init(&dc->disk);
 		bch_sectors_dirty_init(&dc->disk);
 		atomic_set(&dc->has_dirty, 1);
 		atomic_set(&dc->has_dirty, 1);
-		refcount_inc(&dc->count);
 		bch_writeback_queue(dc);
 		bch_writeback_queue(dc);
 	}
 	}
 
 
@@ -1093,14 +1129,16 @@ static void cached_dev_free(struct closure *cl)
 {
 {
 	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
 	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
 
 
-	cancel_delayed_work_sync(&dc->writeback_rate_update);
+	mutex_lock(&bch_register_lock);
+
+	if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
+		cancel_writeback_rate_update_dwork(dc);
+
 	if (!IS_ERR_OR_NULL(dc->writeback_thread))
 	if (!IS_ERR_OR_NULL(dc->writeback_thread))
 		kthread_stop(dc->writeback_thread);
 		kthread_stop(dc->writeback_thread);
 	if (dc->writeback_write_wq)
 	if (dc->writeback_write_wq)
 		destroy_workqueue(dc->writeback_write_wq);
 		destroy_workqueue(dc->writeback_write_wq);
 
 
-	mutex_lock(&bch_register_lock);
-
 	if (atomic_read(&dc->running))
 	if (atomic_read(&dc->running))
 		bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
 		bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
 	bcache_device_free(&dc->disk);
 	bcache_device_free(&dc->disk);
@@ -1170,6 +1208,12 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
 		max(dc->disk.disk->queue->backing_dev_info->ra_pages,
 		max(dc->disk.disk->queue->backing_dev_info->ra_pages,
 		    q->backing_dev_info->ra_pages);
 		    q->backing_dev_info->ra_pages);
 
 
+	atomic_set(&dc->io_errors, 0);
+	dc->io_disable = false;
+	dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT;
+	/* default to auto */
+	dc->stop_when_cache_set_failed = BCH_CACHED_DEV_STOP_AUTO;
+
 	bch_cached_dev_request_init(dc);
 	bch_cached_dev_request_init(dc);
 	bch_cached_dev_writeback_init(dc);
 	bch_cached_dev_writeback_init(dc);
 	return 0;
 	return 0;
@@ -1321,6 +1365,24 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size)
 	return flash_dev_run(c, u);
 	return flash_dev_run(c, u);
 }
 }
 
 
+bool bch_cached_dev_error(struct cached_dev *dc)
+{
+	char name[BDEVNAME_SIZE];
+
+	if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
+		return false;
+
+	dc->io_disable = true;
+	/* make others know io_disable is true earlier */
+	smp_mb();
+
+	pr_err("stop %s: too many IO errors on backing device %s\n",
+		dc->disk.disk->disk_name, bdevname(dc->bdev, name));
+
+	bcache_device_stop(&dc->disk);
+	return true;
+}
+
 /* Cache set */
 /* Cache set */
 
 
 __printf(2, 3)
 __printf(2, 3)
@@ -1332,6 +1394,9 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
 	    test_bit(CACHE_SET_STOPPING, &c->flags))
 	    test_bit(CACHE_SET_STOPPING, &c->flags))
 		return false;
 		return false;
 
 
+	if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags))
+		pr_warn("CACHE_SET_IO_DISABLE already set");
+
 	/* XXX: we can be called from atomic context
 	/* XXX: we can be called from atomic context
 	acquire_console_sem();
 	acquire_console_sem();
 	*/
 	*/
@@ -1443,25 +1508,72 @@ static void cache_set_flush(struct closure *cl)
 	closure_return(cl);
 	closure_return(cl);
 }
 }
 
 
+/*
+ * This function is only called when CACHE_SET_IO_DISABLE is set, which means
+ * cache set is unregistering due to too many I/O errors. In this condition,
+ * the bcache device might be stopped, it depends on stop_when_cache_set_failed
+ * value and whether the broken cache has dirty data:
+ *
+ * dc->stop_when_cache_set_failed    dc->has_dirty   stop bcache device
+ *  BCH_CACHED_STOP_AUTO               0               NO
+ *  BCH_CACHED_STOP_AUTO               1               YES
+ *  BCH_CACHED_DEV_STOP_ALWAYS         0               YES
+ *  BCH_CACHED_DEV_STOP_ALWAYS         1               YES
+ *
+ * The expected behavior is, if stop_when_cache_set_failed is configured to
+ * "auto" via sysfs interface, the bcache device will not be stopped if the
+ * backing device is clean on the broken cache device.
+ */
+static void conditional_stop_bcache_device(struct cache_set *c,
+					   struct bcache_device *d,
+					   struct cached_dev *dc)
+{
+	if (dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_ALWAYS) {
+		pr_warn("stop_when_cache_set_failed of %s is \"always\", stop it for failed cache set %pU.",
+			d->disk->disk_name, c->sb.set_uuid);
+		bcache_device_stop(d);
+	} else if (atomic_read(&dc->has_dirty)) {
+		/*
+		 * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO
+		 * and dc->has_dirty == 1
+		 */
+		pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.",
+			d->disk->disk_name);
+			bcache_device_stop(d);
+	} else {
+		/*
+		 * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO
+		 * and dc->has_dirty == 0
+		 */
+		pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is clean, keep it alive.",
+			d->disk->disk_name);
+	}
+}
+
 static void __cache_set_unregister(struct closure *cl)
 static void __cache_set_unregister(struct closure *cl)
 {
 {
 	struct cache_set *c = container_of(cl, struct cache_set, caching);
 	struct cache_set *c = container_of(cl, struct cache_set, caching);
 	struct cached_dev *dc;
 	struct cached_dev *dc;
+	struct bcache_device *d;
 	size_t i;
 	size_t i;
 
 
 	mutex_lock(&bch_register_lock);
 	mutex_lock(&bch_register_lock);
 
 
-	for (i = 0; i < c->devices_max_used; i++)
-		if (c->devices[i]) {
-			if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
-			    test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
-				dc = container_of(c->devices[i],
-						  struct cached_dev, disk);
-				bch_cached_dev_detach(dc);
-			} else {
-				bcache_device_stop(c->devices[i]);
-			}
+	for (i = 0; i < c->devices_max_used; i++) {
+		d = c->devices[i];
+		if (!d)
+			continue;
+
+		if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
+		    test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
+			dc = container_of(d, struct cached_dev, disk);
+			bch_cached_dev_detach(dc);
+			if (test_bit(CACHE_SET_IO_DISABLE, &c->flags))
+				conditional_stop_bcache_device(c, d, dc);
+		} else {
+			bcache_device_stop(d);
 		}
 		}
+	}
 
 
 	mutex_unlock(&bch_register_lock);
 	mutex_unlock(&bch_register_lock);
 
 
@@ -1567,6 +1679,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 	c->congested_read_threshold_us	= 2000;
 	c->congested_read_threshold_us	= 2000;
 	c->congested_write_threshold_us	= 20000;
 	c->congested_write_threshold_us	= 20000;
 	c->error_limit	= DEFAULT_IO_ERROR_LIMIT;
 	c->error_limit	= DEFAULT_IO_ERROR_LIMIT;
+	WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags));
 
 
 	return c;
 	return c;
 err:
 err:
@@ -2148,7 +2261,6 @@ static int __init bcache_init(void)
 	mutex_init(&bch_register_lock);
 	mutex_init(&bch_register_lock);
 	init_waitqueue_head(&unregister_wait);
 	init_waitqueue_head(&unregister_wait);
 	register_reboot_notifier(&reboot);
 	register_reboot_notifier(&reboot);
-	closure_debug_init();
 
 
 	bcache_major = register_blkdev(0, "bcache");
 	bcache_major = register_blkdev(0, "bcache");
 	if (bcache_major < 0) {
 	if (bcache_major < 0) {
@@ -2160,7 +2272,7 @@ static int __init bcache_init(void)
 	if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) ||
 	if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) ||
 	    !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
 	    !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
 	    bch_request_init() ||
 	    bch_request_init() ||
-	    bch_debug_init(bcache_kobj) ||
+	    bch_debug_init(bcache_kobj) || closure_debug_init() ||
 	    sysfs_create_files(bcache_kobj, files))
 	    sysfs_create_files(bcache_kobj, files))
 		goto err;
 		goto err;
 
 

+ 52 - 3
drivers/md/bcache/sysfs.c

@@ -78,6 +78,7 @@ rw_attribute(congested_write_threshold_us);
 rw_attribute(sequential_cutoff);
 rw_attribute(sequential_cutoff);
 rw_attribute(data_csum);
 rw_attribute(data_csum);
 rw_attribute(cache_mode);
 rw_attribute(cache_mode);
+rw_attribute(stop_when_cache_set_failed);
 rw_attribute(writeback_metadata);
 rw_attribute(writeback_metadata);
 rw_attribute(writeback_running);
 rw_attribute(writeback_running);
 rw_attribute(writeback_percent);
 rw_attribute(writeback_percent);
@@ -95,6 +96,7 @@ read_attribute(partial_stripes_expensive);
 
 
 rw_attribute(synchronous);
 rw_attribute(synchronous);
 rw_attribute(journal_delay_ms);
 rw_attribute(journal_delay_ms);
+rw_attribute(io_disable);
 rw_attribute(discard);
 rw_attribute(discard);
 rw_attribute(running);
 rw_attribute(running);
 rw_attribute(label);
 rw_attribute(label);
@@ -125,6 +127,12 @@ SHOW(__bch_cached_dev)
 					       bch_cache_modes + 1,
 					       bch_cache_modes + 1,
 					       BDEV_CACHE_MODE(&dc->sb));
 					       BDEV_CACHE_MODE(&dc->sb));
 
 
+	if (attr == &sysfs_stop_when_cache_set_failed)
+		return bch_snprint_string_list(buf, PAGE_SIZE,
+					       bch_stop_on_failure_modes + 1,
+					       dc->stop_when_cache_set_failed);
+
+
 	sysfs_printf(data_csum,		"%i", dc->disk.data_csum);
 	sysfs_printf(data_csum,		"%i", dc->disk.data_csum);
 	var_printf(verify,		"%i");
 	var_printf(verify,		"%i");
 	var_printf(bypass_torture_test,	"%i");
 	var_printf(bypass_torture_test,	"%i");
@@ -133,7 +141,9 @@ SHOW(__bch_cached_dev)
 	var_print(writeback_delay);
 	var_print(writeback_delay);
 	var_print(writeback_percent);
 	var_print(writeback_percent);
 	sysfs_hprint(writeback_rate,	dc->writeback_rate.rate << 9);
 	sysfs_hprint(writeback_rate,	dc->writeback_rate.rate << 9);
-
+	sysfs_hprint(io_errors,		atomic_read(&dc->io_errors));
+	sysfs_printf(io_error_limit,	"%i", dc->error_limit);
+	sysfs_printf(io_disable,	"%i", dc->io_disable);
 	var_print(writeback_rate_update_seconds);
 	var_print(writeback_rate_update_seconds);
 	var_print(writeback_rate_i_term_inverse);
 	var_print(writeback_rate_i_term_inverse);
 	var_print(writeback_rate_p_term_inverse);
 	var_print(writeback_rate_p_term_inverse);
@@ -173,7 +183,7 @@ SHOW(__bch_cached_dev)
 	sysfs_hprint(dirty_data,
 	sysfs_hprint(dirty_data,
 		     bcache_dev_sectors_dirty(&dc->disk) << 9);
 		     bcache_dev_sectors_dirty(&dc->disk) << 9);
 
 
-	sysfs_hprint(stripe_size,	dc->disk.stripe_size << 9);
+	sysfs_hprint(stripe_size,	 ((uint64_t)dc->disk.stripe_size) << 9);
 	var_printf(partial_stripes_expensive,	"%u");
 	var_printf(partial_stripes_expensive,	"%u");
 
 
 	var_hprint(sequential_cutoff);
 	var_hprint(sequential_cutoff);
@@ -224,6 +234,14 @@ STORE(__cached_dev)
 	d_strtoul(writeback_rate_i_term_inverse);
 	d_strtoul(writeback_rate_i_term_inverse);
 	d_strtoul_nonzero(writeback_rate_p_term_inverse);
 	d_strtoul_nonzero(writeback_rate_p_term_inverse);
 
 
+	sysfs_strtoul_clamp(io_error_limit, dc->error_limit, 0, INT_MAX);
+
+	if (attr == &sysfs_io_disable) {
+		int v = strtoul_or_return(buf);
+
+		dc->io_disable = v ? 1 : 0;
+	}
+
 	d_strtoi_h(sequential_cutoff);
 	d_strtoi_h(sequential_cutoff);
 	d_strtoi_h(readahead);
 	d_strtoi_h(readahead);
 
 
@@ -246,6 +264,15 @@ STORE(__cached_dev)
 		}
 		}
 	}
 	}
 
 
+	if (attr == &sysfs_stop_when_cache_set_failed) {
+		v = bch_read_string_list(buf, bch_stop_on_failure_modes + 1);
+
+		if (v < 0)
+			return v;
+
+		dc->stop_when_cache_set_failed = v;
+	}
+
 	if (attr == &sysfs_label) {
 	if (attr == &sysfs_label) {
 		if (size > SB_LABEL_SIZE)
 		if (size > SB_LABEL_SIZE)
 			return -EINVAL;
 			return -EINVAL;
@@ -309,7 +336,8 @@ STORE(bch_cached_dev)
 		bch_writeback_queue(dc);
 		bch_writeback_queue(dc);
 
 
 	if (attr == &sysfs_writeback_percent)
 	if (attr == &sysfs_writeback_percent)
-		schedule_delayed_work(&dc->writeback_rate_update,
+		if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
+			schedule_delayed_work(&dc->writeback_rate_update,
 				      dc->writeback_rate_update_seconds * HZ);
 				      dc->writeback_rate_update_seconds * HZ);
 
 
 	mutex_unlock(&bch_register_lock);
 	mutex_unlock(&bch_register_lock);
@@ -324,6 +352,7 @@ static struct attribute *bch_cached_dev_files[] = {
 	&sysfs_data_csum,
 	&sysfs_data_csum,
 #endif
 #endif
 	&sysfs_cache_mode,
 	&sysfs_cache_mode,
+	&sysfs_stop_when_cache_set_failed,
 	&sysfs_writeback_metadata,
 	&sysfs_writeback_metadata,
 	&sysfs_writeback_running,
 	&sysfs_writeback_running,
 	&sysfs_writeback_delay,
 	&sysfs_writeback_delay,
@@ -333,6 +362,9 @@ static struct attribute *bch_cached_dev_files[] = {
 	&sysfs_writeback_rate_i_term_inverse,
 	&sysfs_writeback_rate_i_term_inverse,
 	&sysfs_writeback_rate_p_term_inverse,
 	&sysfs_writeback_rate_p_term_inverse,
 	&sysfs_writeback_rate_debug,
 	&sysfs_writeback_rate_debug,
+	&sysfs_errors,
+	&sysfs_io_error_limit,
+	&sysfs_io_disable,
 	&sysfs_dirty_data,
 	&sysfs_dirty_data,
 	&sysfs_stripe_size,
 	&sysfs_stripe_size,
 	&sysfs_partial_stripes_expensive,
 	&sysfs_partial_stripes_expensive,
@@ -590,6 +622,8 @@ SHOW(__bch_cache_set)
 	sysfs_printf(gc_always_rewrite,		"%i", c->gc_always_rewrite);
 	sysfs_printf(gc_always_rewrite,		"%i", c->gc_always_rewrite);
 	sysfs_printf(btree_shrinker_disabled,	"%i", c->shrinker_disabled);
 	sysfs_printf(btree_shrinker_disabled,	"%i", c->shrinker_disabled);
 	sysfs_printf(copy_gc_enabled,		"%i", c->copy_gc_enabled);
 	sysfs_printf(copy_gc_enabled,		"%i", c->copy_gc_enabled);
+	sysfs_printf(io_disable,		"%i",
+		     test_bit(CACHE_SET_IO_DISABLE, &c->flags));
 
 
 	if (attr == &sysfs_bset_tree_stats)
 	if (attr == &sysfs_bset_tree_stats)
 		return bch_bset_print_stats(c, buf);
 		return bch_bset_print_stats(c, buf);
@@ -679,6 +713,20 @@ STORE(__bch_cache_set)
 	if (attr == &sysfs_io_error_halflife)
 	if (attr == &sysfs_io_error_halflife)
 		c->error_decay = strtoul_or_return(buf) / 88;
 		c->error_decay = strtoul_or_return(buf) / 88;
 
 
+	if (attr == &sysfs_io_disable) {
+		int v = strtoul_or_return(buf);
+
+		if (v) {
+			if (test_and_set_bit(CACHE_SET_IO_DISABLE,
+					     &c->flags))
+				pr_warn("CACHE_SET_IO_DISABLE already set");
+		} else {
+			if (!test_and_clear_bit(CACHE_SET_IO_DISABLE,
+						&c->flags))
+				pr_warn("CACHE_SET_IO_DISABLE already cleared");
+		}
+	}
+
 	sysfs_strtoul(journal_delay_ms,		c->journal_delay_ms);
 	sysfs_strtoul(journal_delay_ms,		c->journal_delay_ms);
 	sysfs_strtoul(verify,			c->verify);
 	sysfs_strtoul(verify,			c->verify);
 	sysfs_strtoul(key_merging_disabled,	c->key_merging_disabled);
 	sysfs_strtoul(key_merging_disabled,	c->key_merging_disabled);
@@ -764,6 +812,7 @@ static struct attribute *bch_cache_set_internal_files[] = {
 	&sysfs_gc_always_rewrite,
 	&sysfs_gc_always_rewrite,
 	&sysfs_btree_shrinker_disabled,
 	&sysfs_btree_shrinker_disabled,
 	&sysfs_copy_gc_enabled,
 	&sysfs_copy_gc_enabled,
+	&sysfs_io_disable,
 	NULL
 	NULL
 };
 };
 KTYPE(bch_cache_set_internal);
 KTYPE(bch_cache_set_internal);

+ 15 - 10
drivers/md/bcache/util.c

@@ -32,20 +32,27 @@ int bch_ ## name ## _h(const char *cp, type *res)		\
 	case 'y':						\
 	case 'y':						\
 	case 'z':						\
 	case 'z':						\
 		u++;						\
 		u++;						\
+		/* fall through */				\
 	case 'e':						\
 	case 'e':						\
 		u++;						\
 		u++;						\
+		/* fall through */				\
 	case 'p':						\
 	case 'p':						\
 		u++;						\
 		u++;						\
+		/* fall through */				\
 	case 't':						\
 	case 't':						\
 		u++;						\
 		u++;						\
+		/* fall through */				\
 	case 'g':						\
 	case 'g':						\
 		u++;						\
 		u++;						\
+		/* fall through */				\
 	case 'm':						\
 	case 'm':						\
 		u++;						\
 		u++;						\
+		/* fall through */				\
 	case 'k':						\
 	case 'k':						\
 		u++;						\
 		u++;						\
 		if (e++ == cp)					\
 		if (e++ == cp)					\
 			return -EINVAL;				\
 			return -EINVAL;				\
+		/* fall through */				\
 	case '\n':						\
 	case '\n':						\
 	case '\0':						\
 	case '\0':						\
 		if (*e == '\n')					\
 		if (*e == '\n')					\
@@ -75,10 +82,9 @@ STRTO_H(strtoll, long long)
 STRTO_H(strtoull, unsigned long long)
 STRTO_H(strtoull, unsigned long long)
 
 
 /**
 /**
- * bch_hprint() - formats @v to human readable string for sysfs.
- *
- * @v - signed 64 bit integer
- * @buf - the (at least 8 byte) buffer to format the result into.
+ * bch_hprint - formats @v to human readable string for sysfs.
+ * @buf: the (at least 8 byte) buffer to format the result into.
+ * @v: signed 64 bit integer
  *
  *
  * Returns the number of bytes used by format.
  * Returns the number of bytes used by format.
  */
  */
@@ -218,13 +224,12 @@ void bch_time_stats_update(struct time_stats *stats, uint64_t start_time)
 }
 }
 
 
 /**
 /**
- * bch_next_delay() - increment @d by the amount of work done, and return how
- * long to delay until the next time to do some work.
- *
- * @d - the struct bch_ratelimit to update
- * @done - the amount of work done, in arbitrary units
+ * bch_next_delay() - update ratelimiting statistics and calculate next delay
+ * @d: the struct bch_ratelimit to update
+ * @done: the amount of work done, in arbitrary units
  *
  *
- * Returns the amount of time to delay by, in jiffies
+ * Increment @d by the amount of work done, and return how long to delay in
+ * jiffies until the next time to do some work.
  */
  */
 uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
 uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
 {
 {

+ 0 - 6
drivers/md/bcache/util.h

@@ -567,12 +567,6 @@ static inline sector_t bdev_sectors(struct block_device *bdev)
 	return bdev->bd_inode->i_size >> 9;
 	return bdev->bd_inode->i_size >> 9;
 }
 }
 
 
-#define closure_bio_submit(bio, cl)					\
-do {									\
-	closure_get(cl);						\
-	generic_make_request(bio);					\
-} while (0)
-
 uint64_t bch_crc64_update(uint64_t, const void *, size_t);
 uint64_t bch_crc64_update(uint64_t, const void *, size_t);
 uint64_t bch_crc64(const void *, size_t);
 uint64_t bch_crc64(const void *, size_t);
 
 

+ 79 - 13
drivers/md/bcache/writeback.c

@@ -114,6 +114,27 @@ static void update_writeback_rate(struct work_struct *work)
 	struct cached_dev *dc = container_of(to_delayed_work(work),
 	struct cached_dev *dc = container_of(to_delayed_work(work),
 					     struct cached_dev,
 					     struct cached_dev,
 					     writeback_rate_update);
 					     writeback_rate_update);
+	struct cache_set *c = dc->disk.c;
+
+	/*
+	 * should check BCACHE_DEV_RATE_DW_RUNNING before calling
+	 * cancel_delayed_work_sync().
+	 */
+	set_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
+	/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
+	smp_mb();
+
+	/*
+	 * CACHE_SET_IO_DISABLE might be set via sysfs interface,
+	 * check it here too.
+	 */
+	if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) ||
+	    test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
+		clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
+		/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
+		smp_mb();
+		return;
+	}
 
 
 	down_read(&dc->writeback_lock);
 	down_read(&dc->writeback_lock);
 
 
@@ -123,8 +144,23 @@ static void update_writeback_rate(struct work_struct *work)
 
 
 	up_read(&dc->writeback_lock);
 	up_read(&dc->writeback_lock);
 
 
-	schedule_delayed_work(&dc->writeback_rate_update,
+	/*
+	 * CACHE_SET_IO_DISABLE might be set via sysfs interface,
+	 * check it here too.
+	 */
+	if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) &&
+	    !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
+		schedule_delayed_work(&dc->writeback_rate_update,
 			      dc->writeback_rate_update_seconds * HZ);
 			      dc->writeback_rate_update_seconds * HZ);
+	}
+
+	/*
+	 * should check BCACHE_DEV_RATE_DW_RUNNING before calling
+	 * cancel_delayed_work_sync().
+	 */
+	clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
+	/* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
+	smp_mb();
 }
 }
 
 
 static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
 static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
@@ -253,7 +289,8 @@ static void write_dirty(struct closure *cl)
 		bio_set_dev(&io->bio, io->dc->bdev);
 		bio_set_dev(&io->bio, io->dc->bdev);
 		io->bio.bi_end_io	= dirty_endio;
 		io->bio.bi_end_io	= dirty_endio;
 
 
-		closure_bio_submit(&io->bio, cl);
+		/* I/O request sent to backing device */
+		closure_bio_submit(io->dc->disk.c, &io->bio, cl);
 	}
 	}
 
 
 	atomic_set(&dc->writeback_sequence_next, next_sequence);
 	atomic_set(&dc->writeback_sequence_next, next_sequence);
@@ -279,7 +316,7 @@ static void read_dirty_submit(struct closure *cl)
 {
 {
 	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
 	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
 
 
-	closure_bio_submit(&io->bio, cl);
+	closure_bio_submit(io->dc->disk.c, &io->bio, cl);
 
 
 	continue_at(cl, write_dirty, io->dc->writeback_write_wq);
 	continue_at(cl, write_dirty, io->dc->writeback_write_wq);
 }
 }
@@ -305,7 +342,9 @@ static void read_dirty(struct cached_dev *dc)
 
 
 	next = bch_keybuf_next(&dc->writeback_keys);
 	next = bch_keybuf_next(&dc->writeback_keys);
 
 
-	while (!kthread_should_stop() && next) {
+	while (!kthread_should_stop() &&
+	       !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) &&
+	       next) {
 		size = 0;
 		size = 0;
 		nk = 0;
 		nk = 0;
 
 
@@ -402,7 +441,9 @@ static void read_dirty(struct cached_dev *dc)
 			}
 			}
 		}
 		}
 
 
-		while (!kthread_should_stop() && delay) {
+		while (!kthread_should_stop() &&
+		       !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) &&
+		       delay) {
 			schedule_timeout_interruptible(delay);
 			schedule_timeout_interruptible(delay);
 			delay = writeback_delay(dc, 0);
 			delay = writeback_delay(dc, 0);
 		}
 		}
@@ -558,21 +599,30 @@ static bool refill_dirty(struct cached_dev *dc)
 static int bch_writeback_thread(void *arg)
 static int bch_writeback_thread(void *arg)
 {
 {
 	struct cached_dev *dc = arg;
 	struct cached_dev *dc = arg;
+	struct cache_set *c = dc->disk.c;
 	bool searched_full_index;
 	bool searched_full_index;
 
 
 	bch_ratelimit_reset(&dc->writeback_rate);
 	bch_ratelimit_reset(&dc->writeback_rate);
 
 
-	while (!kthread_should_stop()) {
+	while (!kthread_should_stop() &&
+	       !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
 		down_write(&dc->writeback_lock);
 		down_write(&dc->writeback_lock);
 		set_current_state(TASK_INTERRUPTIBLE);
 		set_current_state(TASK_INTERRUPTIBLE);
-		if (!atomic_read(&dc->has_dirty) ||
-		    (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
-		     !dc->writeback_running)) {
+		/*
+		 * If the bache device is detaching, skip here and continue
+		 * to perform writeback. Otherwise, if no dirty data on cache,
+		 * or there is dirty data on cache but writeback is disabled,
+		 * the writeback thread should sleep here and wait for others
+		 * to wake up it.
+		 */
+		if (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
+		    (!atomic_read(&dc->has_dirty) || !dc->writeback_running)) {
 			up_write(&dc->writeback_lock);
 			up_write(&dc->writeback_lock);
 
 
-			if (kthread_should_stop()) {
+			if (kthread_should_stop() ||
+			    test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
 				set_current_state(TASK_RUNNING);
 				set_current_state(TASK_RUNNING);
-				return 0;
+				break;
 			}
 			}
 
 
 			schedule();
 			schedule();
@@ -585,9 +635,16 @@ static int bch_writeback_thread(void *arg)
 		if (searched_full_index &&
 		if (searched_full_index &&
 		    RB_EMPTY_ROOT(&dc->writeback_keys.keys)) {
 		    RB_EMPTY_ROOT(&dc->writeback_keys.keys)) {
 			atomic_set(&dc->has_dirty, 0);
 			atomic_set(&dc->has_dirty, 0);
-			cached_dev_put(dc);
 			SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
 			SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
 			bch_write_bdev_super(dc, NULL);
 			bch_write_bdev_super(dc, NULL);
+			/*
+			 * If bcache device is detaching via sysfs interface,
+			 * writeback thread should stop after there is no dirty
+			 * data on cache. BCACHE_DEV_DETACHING flag is set in
+			 * bch_cached_dev_detach().
+			 */
+			if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
+				break;
 		}
 		}
 
 
 		up_write(&dc->writeback_lock);
 		up_write(&dc->writeback_lock);
@@ -599,6 +656,7 @@ static int bch_writeback_thread(void *arg)
 
 
 			while (delay &&
 			while (delay &&
 			       !kthread_should_stop() &&
 			       !kthread_should_stop() &&
+			       !test_bit(CACHE_SET_IO_DISABLE, &c->flags) &&
 			       !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
 			       !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
 				delay = schedule_timeout_interruptible(delay);
 				delay = schedule_timeout_interruptible(delay);
 
 
@@ -606,6 +664,9 @@ static int bch_writeback_thread(void *arg)
 		}
 		}
 	}
 	}
 
 
+	cached_dev_put(dc);
+	wait_for_kthread_stop();
+
 	return 0;
 	return 0;
 }
 }
 
 
@@ -659,6 +720,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
 	dc->writeback_rate_p_term_inverse = 40;
 	dc->writeback_rate_p_term_inverse = 40;
 	dc->writeback_rate_i_term_inverse = 10000;
 	dc->writeback_rate_i_term_inverse = 10000;
 
 
+	WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
 	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
 	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
 }
 }
 
 
@@ -669,11 +731,15 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc)
 	if (!dc->writeback_write_wq)
 	if (!dc->writeback_write_wq)
 		return -ENOMEM;
 		return -ENOMEM;
 
 
+	cached_dev_get(dc);
 	dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
 	dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
 					      "bcache_writeback");
 					      "bcache_writeback");
-	if (IS_ERR(dc->writeback_thread))
+	if (IS_ERR(dc->writeback_thread)) {
+		cached_dev_put(dc);
 		return PTR_ERR(dc->writeback_thread);
 		return PTR_ERR(dc->writeback_thread);
+	}
 
 
+	WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
 	schedule_delayed_work(&dc->writeback_rate_update,
 	schedule_delayed_work(&dc->writeback_rate_update,
 			      dc->writeback_rate_update_seconds * HZ);
 			      dc->writeback_rate_update_seconds * HZ);
 
 

+ 1 - 3
drivers/md/bcache/writeback.h

@@ -39,7 +39,7 @@ static inline uint64_t  bcache_flash_devs_sectors_dirty(struct cache_set *c)
 
 
 		if (!d || !UUID_FLASH_ONLY(&c->uuids[i]))
 		if (!d || !UUID_FLASH_ONLY(&c->uuids[i]))
 			continue;
 			continue;
-	   ret += bcache_dev_sectors_dirty(d);
+		ret += bcache_dev_sectors_dirty(d);
 	}
 	}
 
 
 	mutex_unlock(&bch_register_lock);
 	mutex_unlock(&bch_register_lock);
@@ -105,8 +105,6 @@ static inline void bch_writeback_add(struct cached_dev *dc)
 {
 {
 	if (!atomic_read(&dc->has_dirty) &&
 	if (!atomic_read(&dc->has_dirty) &&
 	    !atomic_xchg(&dc->has_dirty, 1)) {
 	    !atomic_xchg(&dc->has_dirty, 1)) {
-		refcount_inc(&dc->count);
-
 		if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
 		if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
 			SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY);
 			SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY);
 			/* XXX: should do this synchronously */
 			/* XXX: should do this synchronously */

+ 8 - 8
drivers/md/dm-table.c

@@ -1857,7 +1857,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 	q->limits = *limits;
 	q->limits = *limits;
 
 
 	if (!dm_table_supports_discards(t)) {
 	if (!dm_table_supports_discards(t)) {
-		queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
+		blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
 		/* Must also clear discard limits... */
 		/* Must also clear discard limits... */
 		q->limits.max_discard_sectors = 0;
 		q->limits.max_discard_sectors = 0;
 		q->limits.max_hw_discard_sectors = 0;
 		q->limits.max_hw_discard_sectors = 0;
@@ -1865,7 +1865,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 		q->limits.discard_alignment = 0;
 		q->limits.discard_alignment = 0;
 		q->limits.discard_misaligned = 0;
 		q->limits.discard_misaligned = 0;
 	} else
 	} else
-		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
 
 
 	if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) {
 	if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) {
 		wc = true;
 		wc = true;
@@ -1875,15 +1875,15 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 	blk_queue_write_cache(q, wc, fua);
 	blk_queue_write_cache(q, wc, fua);
 
 
 	if (dm_table_supports_dax(t))
 	if (dm_table_supports_dax(t))
-		queue_flag_set_unlocked(QUEUE_FLAG_DAX, q);
+		blk_queue_flag_set(QUEUE_FLAG_DAX, q);
 	if (dm_table_supports_dax_write_cache(t))
 	if (dm_table_supports_dax_write_cache(t))
 		dax_write_cache(t->md->dax_dev, true);
 		dax_write_cache(t->md->dax_dev, true);
 
 
 	/* Ensure that all underlying devices are non-rotational. */
 	/* Ensure that all underlying devices are non-rotational. */
 	if (dm_table_all_devices_attribute(t, device_is_nonrot))
 	if (dm_table_all_devices_attribute(t, device_is_nonrot))
-		queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
+		blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
 	else
 	else
-		queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, q);
+		blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
 
 
 	if (!dm_table_supports_write_same(t))
 	if (!dm_table_supports_write_same(t))
 		q->limits.max_write_same_sectors = 0;
 		q->limits.max_write_same_sectors = 0;
@@ -1891,9 +1891,9 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 		q->limits.max_write_zeroes_sectors = 0;
 		q->limits.max_write_zeroes_sectors = 0;
 
 
 	if (dm_table_all_devices_attribute(t, queue_supports_sg_merge))
 	if (dm_table_all_devices_attribute(t, queue_supports_sg_merge))
-		queue_flag_clear_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
+		blk_queue_flag_clear(QUEUE_FLAG_NO_SG_MERGE, q);
 	else
 	else
-		queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
+		blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q);
 
 
 	dm_table_verify_integrity(t);
 	dm_table_verify_integrity(t);
 
 
@@ -1904,7 +1904,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 	 * have it set.
 	 * have it set.
 	 */
 	 */
 	if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random))
 	if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random))
-		queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q);
+		blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q);
 }
 }
 
 
 unsigned int dm_table_get_num_targets(struct dm_table *t)
 unsigned int dm_table_get_num_targets(struct dm_table *t)

+ 1 - 1
drivers/md/dm.c

@@ -1848,7 +1848,7 @@ static struct mapped_device *alloc_dev(int minor)
 	INIT_LIST_HEAD(&md->table_devices);
 	INIT_LIST_HEAD(&md->table_devices);
 	spin_lock_init(&md->uevent_lock);
 	spin_lock_init(&md->uevent_lock);
 
 
-	md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id);
+	md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id, NULL);
 	if (!md->queue)
 	if (!md->queue)
 		goto bad;
 		goto bad;
 	md->queue->queuedata = md;
 	md->queue->queuedata = md;

+ 2 - 2
drivers/md/md-linear.c

@@ -138,9 +138,9 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
 	}
 	}
 
 
 	if (!discard_supported)
 	if (!discard_supported)
-		queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+		blk_queue_flag_clear(QUEUE_FLAG_DISCARD, mddev->queue);
 	else
 	else
-		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+		blk_queue_flag_set(QUEUE_FLAG_DISCARD, mddev->queue);
 
 
 	/*
 	/*
 	 * Here we calculate the device offsets.
 	 * Here we calculate the device offsets.

+ 5 - 5
drivers/md/md.c

@@ -5206,12 +5206,12 @@ static void md_free(struct kobject *ko)
 	if (mddev->sysfs_state)
 	if (mddev->sysfs_state)
 		sysfs_put(mddev->sysfs_state);
 		sysfs_put(mddev->sysfs_state);
 
 
+	if (mddev->gendisk)
+		del_gendisk(mddev->gendisk);
 	if (mddev->queue)
 	if (mddev->queue)
 		blk_cleanup_queue(mddev->queue);
 		blk_cleanup_queue(mddev->queue);
-	if (mddev->gendisk) {
-		del_gendisk(mddev->gendisk);
+	if (mddev->gendisk)
 		put_disk(mddev->gendisk);
 		put_disk(mddev->gendisk);
-	}
 	percpu_ref_exit(&mddev->writes_pending);
 	percpu_ref_exit(&mddev->writes_pending);
 
 
 	kfree(mddev);
 	kfree(mddev);
@@ -5619,9 +5619,9 @@ int md_run(struct mddev *mddev)
 		if (mddev->degraded)
 		if (mddev->degraded)
 			nonrot = false;
 			nonrot = false;
 		if (nonrot)
 		if (nonrot)
-			queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mddev->queue);
+			blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue);
 		else
 		else
-			queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, mddev->queue);
+			blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue);
 		mddev->queue->backing_dev_info->congested_data = mddev;
 		mddev->queue->backing_dev_info->congested_data = mddev;
 		mddev->queue->backing_dev_info->congested_fn = md_congested;
 		mddev->queue->backing_dev_info->congested_fn = md_congested;
 	}
 	}

+ 2 - 2
drivers/md/raid0.c

@@ -399,9 +399,9 @@ static int raid0_run(struct mddev *mddev)
 				discard_supported = true;
 				discard_supported = true;
 		}
 		}
 		if (!discard_supported)
 		if (!discard_supported)
-			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+			blk_queue_flag_clear(QUEUE_FLAG_DISCARD, mddev->queue);
 		else
 		else
-			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+			blk_queue_flag_set(QUEUE_FLAG_DISCARD, mddev->queue);
 	}
 	}
 
 
 	/* calculate array device size */
 	/* calculate array device size */

+ 3 - 3
drivers/md/raid1.c

@@ -1760,7 +1760,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		}
 		}
 	}
 	}
 	if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
 	if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
-		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+		blk_queue_flag_set(QUEUE_FLAG_DISCARD, mddev->queue);
 	print_conf(conf);
 	print_conf(conf);
 	return err;
 	return err;
 }
 }
@@ -3110,10 +3110,10 @@ static int raid1_run(struct mddev *mddev)
 
 
 	if (mddev->queue) {
 	if (mddev->queue) {
 		if (discard_supported)
 		if (discard_supported)
-			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
+			blk_queue_flag_set(QUEUE_FLAG_DISCARD,
 						mddev->queue);
 						mddev->queue);
 		else
 		else
-			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
+			blk_queue_flag_clear(QUEUE_FLAG_DISCARD,
 						  mddev->queue);
 						  mddev->queue);
 	}
 	}
 
 

+ 3 - 3
drivers/md/raid10.c

@@ -1845,7 +1845,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		break;
 		break;
 	}
 	}
 	if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
 	if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
-		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+		blk_queue_flag_set(QUEUE_FLAG_DISCARD, mddev->queue);
 
 
 	print_conf(conf);
 	print_conf(conf);
 	return err;
 	return err;
@@ -3846,10 +3846,10 @@ static int raid10_run(struct mddev *mddev)
 
 
 	if (mddev->queue) {
 	if (mddev->queue) {
 		if (discard_supported)
 		if (discard_supported)
-			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
+			blk_queue_flag_set(QUEUE_FLAG_DISCARD,
 						mddev->queue);
 						mddev->queue);
 		else
 		else
-			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
+			blk_queue_flag_clear(QUEUE_FLAG_DISCARD,
 						  mddev->queue);
 						  mddev->queue);
 	}
 	}
 	/* need to check that every block has at least one working mirror */
 	/* need to check that every block has at least one working mirror */

+ 2 - 2
drivers/md/raid5.c

@@ -7443,10 +7443,10 @@ static int raid5_run(struct mddev *mddev)
 		if (devices_handle_discard_safely &&
 		if (devices_handle_discard_safely &&
 		    mddev->queue->limits.max_discard_sectors >= (stripe >> 9) &&
 		    mddev->queue->limits.max_discard_sectors >= (stripe >> 9) &&
 		    mddev->queue->limits.discard_granularity >= stripe)
 		    mddev->queue->limits.discard_granularity >= stripe)
-			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
+			blk_queue_flag_set(QUEUE_FLAG_DISCARD,
 						mddev->queue);
 						mddev->queue);
 		else
 		else
-			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
+			blk_queue_flag_clear(QUEUE_FLAG_DISCARD,
 						mddev->queue);
 						mddev->queue);
 
 
 		blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
 		blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);

+ 2 - 2
drivers/misc/cardreader/rtsx_pcr.c

@@ -444,12 +444,12 @@ static void rtsx_pci_add_sg_tbl(struct rtsx_pcr *pcr,
 {
 {
 	u64 *ptr = (u64 *)(pcr->host_sg_tbl_ptr) + pcr->sgi;
 	u64 *ptr = (u64 *)(pcr->host_sg_tbl_ptr) + pcr->sgi;
 	u64 val;
 	u64 val;
-	u8 option = SG_VALID | SG_TRANS_DATA;
+	u8 option = RTSX_SG_VALID | RTSX_SG_TRANS_DATA;
 
 
 	pcr_dbg(pcr, "DMA addr: 0x%x, Len: 0x%x\n", (unsigned int)addr, len);
 	pcr_dbg(pcr, "DMA addr: 0x%x, Len: 0x%x\n", (unsigned int)addr, len);
 
 
 	if (end)
 	if (end)
-		option |= SG_END;
+		option |= RTSX_SG_END;
 	val = ((u64)addr << 32) | ((u64)len << 12) | option;
 	val = ((u64)addr << 32) | ((u64)len << 12) | option;
 
 
 	put_unaligned_le64(val, ptr);
 	put_unaligned_le64(val, ptr);

+ 1 - 1
drivers/mmc/core/block.c

@@ -2659,7 +2659,6 @@ static void mmc_blk_remove_req(struct mmc_blk_data *md)
 		 * from being accepted.
 		 * from being accepted.
 		 */
 		 */
 		card = md->queue.card;
 		card = md->queue.card;
-		mmc_cleanup_queue(&md->queue);
 		if (md->disk->flags & GENHD_FL_UP) {
 		if (md->disk->flags & GENHD_FL_UP) {
 			device_remove_file(disk_to_dev(md->disk), &md->force_ro);
 			device_remove_file(disk_to_dev(md->disk), &md->force_ro);
 			if ((md->area_type & MMC_BLK_DATA_AREA_BOOT) &&
 			if ((md->area_type & MMC_BLK_DATA_AREA_BOOT) &&
@@ -2669,6 +2668,7 @@ static void mmc_blk_remove_req(struct mmc_blk_data *md)
 
 
 			del_gendisk(md->disk);
 			del_gendisk(md->disk);
 		}
 		}
+		mmc_cleanup_queue(&md->queue);
 		mmc_blk_put(md);
 		mmc_blk_put(md);
 	}
 	}
 }
 }

+ 4 - 4
drivers/mmc/core/queue.c

@@ -185,14 +185,14 @@ static void mmc_queue_setup_discard(struct request_queue *q,
 	if (!max_discard)
 	if (!max_discard)
 		return;
 		return;
 
 
-	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+	blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
 	blk_queue_max_discard_sectors(q, max_discard);
 	blk_queue_max_discard_sectors(q, max_discard);
 	q->limits.discard_granularity = card->pref_erase << 9;
 	q->limits.discard_granularity = card->pref_erase << 9;
 	/* granularity must not be greater than max. discard */
 	/* granularity must not be greater than max. discard */
 	if (card->pref_erase > max_discard)
 	if (card->pref_erase > max_discard)
 		q->limits.discard_granularity = 0;
 		q->limits.discard_granularity = 0;
 	if (mmc_can_secure_erase_trim(card))
 	if (mmc_can_secure_erase_trim(card))
-		queue_flag_set_unlocked(QUEUE_FLAG_SECERASE, q);
+		blk_queue_flag_set(QUEUE_FLAG_SECERASE, q);
 }
 }
 
 
 /**
 /**
@@ -356,8 +356,8 @@ static void mmc_setup_queue(struct mmc_queue *mq, struct mmc_card *card)
 	if (mmc_dev(host)->dma_mask && *mmc_dev(host)->dma_mask)
 	if (mmc_dev(host)->dma_mask && *mmc_dev(host)->dma_mask)
 		limit = (u64)dma_max_pfn(mmc_dev(host)) << PAGE_SHIFT;
 		limit = (u64)dma_max_pfn(mmc_dev(host)) << PAGE_SHIFT;
 
 
-	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mq->queue);
-	queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, mq->queue);
+	blk_queue_flag_set(QUEUE_FLAG_NONROT, mq->queue);
+	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, mq->queue);
 	if (mmc_can_erase(card))
 	if (mmc_can_erase(card))
 		mmc_queue_setup_discard(mq->queue, card);
 		mmc_queue_setup_discard(mq->queue, card);
 
 

+ 3 - 3
drivers/mtd/mtd_blkdevs.c

@@ -419,11 +419,11 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new)
 	blk_queue_logical_block_size(new->rq, tr->blksize);
 	blk_queue_logical_block_size(new->rq, tr->blksize);
 
 
 	blk_queue_bounce_limit(new->rq, BLK_BOUNCE_HIGH);
 	blk_queue_bounce_limit(new->rq, BLK_BOUNCE_HIGH);
-	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, new->rq);
-	queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, new->rq);
+	blk_queue_flag_set(QUEUE_FLAG_NONROT, new->rq);
+	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, new->rq);
 
 
 	if (tr->discard) {
 	if (tr->discard) {
-		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, new->rq);
+		blk_queue_flag_set(QUEUE_FLAG_DISCARD, new->rq);
 		blk_queue_max_discard_sectors(new->rq, UINT_MAX);
 		blk_queue_max_discard_sectors(new->rq, UINT_MAX);
 	}
 	}
 
 

+ 1 - 1
drivers/nvdimm/blk.c

@@ -266,7 +266,7 @@ static int nsblk_attach_disk(struct nd_namespace_blk *nsblk)
 	blk_queue_make_request(q, nd_blk_make_request);
 	blk_queue_make_request(q, nd_blk_make_request);
 	blk_queue_max_hw_sectors(q, UINT_MAX);
 	blk_queue_max_hw_sectors(q, UINT_MAX);
 	blk_queue_logical_block_size(q, nsblk_sector_size(nsblk));
 	blk_queue_logical_block_size(q, nsblk_sector_size(nsblk));
-	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
+	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
 	q->queuedata = nsblk;
 	q->queuedata = nsblk;
 
 
 	disk = alloc_disk(0);
 	disk = alloc_disk(0);

+ 1 - 1
drivers/nvdimm/btt.c

@@ -1542,7 +1542,7 @@ static int btt_blk_init(struct btt *btt)
 	blk_queue_make_request(btt->btt_queue, btt_make_request);
 	blk_queue_make_request(btt->btt_queue, btt_make_request);
 	blk_queue_logical_block_size(btt->btt_queue, btt->sector_size);
 	blk_queue_logical_block_size(btt->btt_queue, btt->sector_size);
 	blk_queue_max_hw_sectors(btt->btt_queue, UINT_MAX);
 	blk_queue_max_hw_sectors(btt->btt_queue, UINT_MAX);
-	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, btt->btt_queue);
+	blk_queue_flag_set(QUEUE_FLAG_NONROT, btt->btt_queue);
 	btt->btt_queue->queuedata = btt;
 	btt->btt_queue->queuedata = btt;
 
 
 	if (btt_meta_size(btt)) {
 	if (btt_meta_size(btt)) {

+ 0 - 1
drivers/nvdimm/nd.h

@@ -29,7 +29,6 @@ enum {
 	 * BTT instance
 	 * BTT instance
 	 */
 	 */
 	ND_MAX_LANES = 256,
 	ND_MAX_LANES = 256,
-	SECTOR_SHIFT = 9,
 	INT_LBASIZE_ALIGNMENT = 64,
 	INT_LBASIZE_ALIGNMENT = 64,
 	NVDIMM_IO_ATOMIC = 1,
 	NVDIMM_IO_ATOMIC = 1,
 };
 };

+ 3 - 3
drivers/nvdimm/pmem.c

@@ -343,7 +343,7 @@ static int pmem_attach_disk(struct device *dev,
 		return -EBUSY;
 		return -EBUSY;
 	}
 	}
 
 
-	q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev));
+	q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev), NULL);
 	if (!q)
 	if (!q)
 		return -ENOMEM;
 		return -ENOMEM;
 
 
@@ -387,8 +387,8 @@ static int pmem_attach_disk(struct device *dev,
 	blk_queue_physical_block_size(q, PAGE_SIZE);
 	blk_queue_physical_block_size(q, PAGE_SIZE);
 	blk_queue_logical_block_size(q, pmem_sector_size(ndns));
 	blk_queue_logical_block_size(q, pmem_sector_size(ndns));
 	blk_queue_max_hw_sectors(q, UINT_MAX);
 	blk_queue_max_hw_sectors(q, UINT_MAX);
-	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
-	queue_flag_set_unlocked(QUEUE_FLAG_DAX, q);
+	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
+	blk_queue_flag_set(QUEUE_FLAG_DAX, q);
 	q->queuedata = pmem;
 	q->queuedata = pmem;
 
 
 	disk = alloc_disk_node(0, nid);
 	disk = alloc_disk_node(0, nid);

+ 1 - 0
drivers/nvme/host/Makefile

@@ -12,6 +12,7 @@ nvme-core-y				:= core.o
 nvme-core-$(CONFIG_TRACING)		+= trace.o
 nvme-core-$(CONFIG_TRACING)		+= trace.o
 nvme-core-$(CONFIG_NVME_MULTIPATH)	+= multipath.o
 nvme-core-$(CONFIG_NVME_MULTIPATH)	+= multipath.o
 nvme-core-$(CONFIG_NVM)			+= lightnvm.o
 nvme-core-$(CONFIG_NVM)			+= lightnvm.o
+nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS)	+= fault_inject.o
 
 
 nvme-y					+= pci.o
 nvme-y					+= pci.o
 
 

+ 79 - 44
drivers/nvme/host/core.c

@@ -100,11 +100,6 @@ static struct class *nvme_subsys_class;
 static void nvme_ns_remove(struct nvme_ns *ns);
 static void nvme_ns_remove(struct nvme_ns *ns);
 static int nvme_revalidate_disk(struct gendisk *disk);
 static int nvme_revalidate_disk(struct gendisk *disk);
 
 
-static __le32 nvme_get_log_dw10(u8 lid, size_t size)
-{
-	return cpu_to_le32((((size / 4) - 1) << 16) | lid);
-}
-
 int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
 int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
 {
 {
 	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
 	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
@@ -135,6 +130,9 @@ static void nvme_delete_ctrl_work(struct work_struct *work)
 	struct nvme_ctrl *ctrl =
 	struct nvme_ctrl *ctrl =
 		container_of(work, struct nvme_ctrl, delete_work);
 		container_of(work, struct nvme_ctrl, delete_work);
 
 
+	dev_info(ctrl->device,
+		 "Removing ctrl: NQN \"%s\"\n", ctrl->opts->subsysnqn);
+
 	flush_work(&ctrl->reset_work);
 	flush_work(&ctrl->reset_work);
 	nvme_stop_ctrl(ctrl);
 	nvme_stop_ctrl(ctrl);
 	nvme_remove_namespaces(ctrl);
 	nvme_remove_namespaces(ctrl);
@@ -948,7 +946,8 @@ static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *n
 	c.identify.opcode = nvme_admin_identify;
 	c.identify.opcode = nvme_admin_identify;
 	c.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST;
 	c.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST;
 	c.identify.nsid = cpu_to_le32(nsid);
 	c.identify.nsid = cpu_to_le32(nsid);
-	return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000);
+	return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list,
+				    NVME_IDENTIFY_DATA_SIZE);
 }
 }
 
 
 static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl,
 static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl,
@@ -1124,13 +1123,13 @@ static void nvme_update_formats(struct nvme_ctrl *ctrl)
 	struct nvme_ns *ns, *next;
 	struct nvme_ns *ns, *next;
 	LIST_HEAD(rm_list);
 	LIST_HEAD(rm_list);
 
 
-	mutex_lock(&ctrl->namespaces_mutex);
+	down_write(&ctrl->namespaces_rwsem);
 	list_for_each_entry(ns, &ctrl->namespaces, list) {
 	list_for_each_entry(ns, &ctrl->namespaces, list) {
 		if (ns->disk && nvme_revalidate_disk(ns->disk)) {
 		if (ns->disk && nvme_revalidate_disk(ns->disk)) {
 			list_move_tail(&ns->list, &rm_list);
 			list_move_tail(&ns->list, &rm_list);
 		}
 		}
 	}
 	}
-	mutex_unlock(&ctrl->namespaces_mutex);
+	up_write(&ctrl->namespaces_rwsem);
 
 
 	list_for_each_entry_safe(ns, next, &rm_list, list)
 	list_for_each_entry_safe(ns, next, &rm_list, list)
 		nvme_ns_remove(ns);
 		nvme_ns_remove(ns);
@@ -1358,7 +1357,7 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl,
 
 
 	blk_queue_max_discard_sectors(queue, UINT_MAX);
 	blk_queue_max_discard_sectors(queue, UINT_MAX);
 	blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES);
 	blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES);
-	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, queue);
+	blk_queue_flag_set(QUEUE_FLAG_DISCARD, queue);
 
 
 	if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
 	if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
 		blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
 		blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
@@ -1449,6 +1448,8 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
 	if (ns->noiob)
 	if (ns->noiob)
 		nvme_set_chunk_size(ns);
 		nvme_set_chunk_size(ns);
 	nvme_update_disk_info(disk, ns, id);
 	nvme_update_disk_info(disk, ns, id);
+	if (ns->ndev)
+		nvme_nvm_update_nvm_info(ns);
 #ifdef CONFIG_NVME_MULTIPATH
 #ifdef CONFIG_NVME_MULTIPATH
 	if (ns->head->disk)
 	if (ns->head->disk)
 		nvme_update_disk_info(ns->head->disk, ns, id);
 		nvme_update_disk_info(ns->head->disk, ns, id);
@@ -2217,18 +2218,35 @@ out_unlock:
 	return ret;
 	return ret;
 }
 }
 
 
-static int nvme_get_log(struct nvme_ctrl *ctrl, u8 log_page, void *log,
-			size_t size)
+int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
+		     u8 log_page, void *log,
+		     size_t size, size_t offset)
 {
 {
 	struct nvme_command c = { };
 	struct nvme_command c = { };
+	unsigned long dwlen = size / 4 - 1;
+
+	c.get_log_page.opcode = nvme_admin_get_log_page;
+
+	if (ns)
+		c.get_log_page.nsid = cpu_to_le32(ns->head->ns_id);
+	else
+		c.get_log_page.nsid = cpu_to_le32(NVME_NSID_ALL);
 
 
-	c.common.opcode = nvme_admin_get_log_page;
-	c.common.nsid = cpu_to_le32(NVME_NSID_ALL);
-	c.common.cdw10[0] = nvme_get_log_dw10(log_page, size);
+	c.get_log_page.lid = log_page;
+	c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1));
+	c.get_log_page.numdu = cpu_to_le16(dwlen >> 16);
+	c.get_log_page.lpol = cpu_to_le32(offset & ((1ULL << 32) - 1));
+	c.get_log_page.lpou = cpu_to_le32(offset >> 32ULL);
 
 
 	return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
 	return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
 }
 }
 
 
+static int nvme_get_log(struct nvme_ctrl *ctrl, u8 log_page, void *log,
+			size_t size)
+{
+	return nvme_get_log_ext(ctrl, NULL, log_page, log, size, 0);
+}
+
 static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
 static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
 {
 {
 	int ret;
 	int ret;
@@ -2440,7 +2458,7 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
 	struct nvme_ns *ns;
 	struct nvme_ns *ns;
 	int ret;
 	int ret;
 
 
-	mutex_lock(&ctrl->namespaces_mutex);
+	down_read(&ctrl->namespaces_rwsem);
 	if (list_empty(&ctrl->namespaces)) {
 	if (list_empty(&ctrl->namespaces)) {
 		ret = -ENOTTY;
 		ret = -ENOTTY;
 		goto out_unlock;
 		goto out_unlock;
@@ -2457,14 +2475,14 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
 	dev_warn(ctrl->device,
 	dev_warn(ctrl->device,
 		"using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
 		"using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
 	kref_get(&ns->kref);
 	kref_get(&ns->kref);
-	mutex_unlock(&ctrl->namespaces_mutex);
+	up_read(&ctrl->namespaces_rwsem);
 
 
 	ret = nvme_user_cmd(ctrl, ns, argp);
 	ret = nvme_user_cmd(ctrl, ns, argp);
 	nvme_put_ns(ns);
 	nvme_put_ns(ns);
 	return ret;
 	return ret;
 
 
 out_unlock:
 out_unlock:
-	mutex_unlock(&ctrl->namespaces_mutex);
+	up_read(&ctrl->namespaces_rwsem);
 	return ret;
 	return ret;
 }
 }
 
 
@@ -2793,6 +2811,7 @@ static int __nvme_check_ids(struct nvme_subsystem *subsys,
 
 
 	list_for_each_entry(h, &subsys->nsheads, entry) {
 	list_for_each_entry(h, &subsys->nsheads, entry) {
 		if (nvme_ns_ids_valid(&new->ids) &&
 		if (nvme_ns_ids_valid(&new->ids) &&
+		    !list_empty(&h->list) &&
 		    nvme_ns_ids_equal(&new->ids, &h->ids))
 		    nvme_ns_ids_equal(&new->ids, &h->ids))
 			return -EINVAL;
 			return -EINVAL;
 	}
 	}
@@ -2893,7 +2912,7 @@ static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 {
 {
 	struct nvme_ns *ns, *ret = NULL;
 	struct nvme_ns *ns, *ret = NULL;
 
 
-	mutex_lock(&ctrl->namespaces_mutex);
+	down_read(&ctrl->namespaces_rwsem);
 	list_for_each_entry(ns, &ctrl->namespaces, list) {
 	list_for_each_entry(ns, &ctrl->namespaces, list) {
 		if (ns->head->ns_id == nsid) {
 		if (ns->head->ns_id == nsid) {
 			if (!kref_get_unless_zero(&ns->kref))
 			if (!kref_get_unless_zero(&ns->kref))
@@ -2904,7 +2923,7 @@ static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 		if (ns->head->ns_id > nsid)
 		if (ns->head->ns_id > nsid)
 			break;
 			break;
 	}
 	}
-	mutex_unlock(&ctrl->namespaces_mutex);
+	up_read(&ctrl->namespaces_rwsem);
 	return ret;
 	return ret;
 }
 }
 
 
@@ -2949,7 +2968,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	ns->queue = blk_mq_init_queue(ctrl->tagset);
 	ns->queue = blk_mq_init_queue(ctrl->tagset);
 	if (IS_ERR(ns->queue))
 	if (IS_ERR(ns->queue))
 		goto out_free_ns;
 		goto out_free_ns;
-	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
+	blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
 	ns->queue->queuedata = ns;
 	ns->queue->queuedata = ns;
 	ns->ctrl = ctrl;
 	ns->ctrl = ctrl;
 
 
@@ -3015,9 +3034,9 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 
 
 	__nvme_revalidate_disk(disk, id);
 	__nvme_revalidate_disk(disk, id);
 
 
-	mutex_lock(&ctrl->namespaces_mutex);
+	down_write(&ctrl->namespaces_rwsem);
 	list_add_tail(&ns->list, &ctrl->namespaces);
 	list_add_tail(&ns->list, &ctrl->namespaces);
-	mutex_unlock(&ctrl->namespaces_mutex);
+	up_write(&ctrl->namespaces_rwsem);
 
 
 	nvme_get_ctrl(ctrl);
 	nvme_get_ctrl(ctrl);
 
 
@@ -3033,6 +3052,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 			ns->disk->disk_name);
 			ns->disk->disk_name);
 
 
 	nvme_mpath_add_disk(ns->head);
 	nvme_mpath_add_disk(ns->head);
+	nvme_fault_inject_init(ns);
 	return;
 	return;
  out_unlink_ns:
  out_unlink_ns:
 	mutex_lock(&ctrl->subsys->lock);
 	mutex_lock(&ctrl->subsys->lock);
@@ -3051,6 +3071,7 @@ static void nvme_ns_remove(struct nvme_ns *ns)
 	if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
 	if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
 		return;
 		return;
 
 
+	nvme_fault_inject_fini(ns);
 	if (ns->disk && ns->disk->flags & GENHD_FL_UP) {
 	if (ns->disk && ns->disk->flags & GENHD_FL_UP) {
 		sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
 		sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
 					&nvme_ns_id_attr_group);
 					&nvme_ns_id_attr_group);
@@ -3067,9 +3088,9 @@ static void nvme_ns_remove(struct nvme_ns *ns)
 	list_del_rcu(&ns->siblings);
 	list_del_rcu(&ns->siblings);
 	mutex_unlock(&ns->ctrl->subsys->lock);
 	mutex_unlock(&ns->ctrl->subsys->lock);
 
 
-	mutex_lock(&ns->ctrl->namespaces_mutex);
+	down_write(&ns->ctrl->namespaces_rwsem);
 	list_del_init(&ns->list);
 	list_del_init(&ns->list);
-	mutex_unlock(&ns->ctrl->namespaces_mutex);
+	up_write(&ns->ctrl->namespaces_rwsem);
 
 
 	synchronize_srcu(&ns->head->srcu);
 	synchronize_srcu(&ns->head->srcu);
 	nvme_mpath_check_last_path(ns);
 	nvme_mpath_check_last_path(ns);
@@ -3093,11 +3114,18 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
 					unsigned nsid)
 					unsigned nsid)
 {
 {
 	struct nvme_ns *ns, *next;
 	struct nvme_ns *ns, *next;
+	LIST_HEAD(rm_list);
 
 
+	down_write(&ctrl->namespaces_rwsem);
 	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
 	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
 		if (ns->head->ns_id > nsid)
 		if (ns->head->ns_id > nsid)
-			nvme_ns_remove(ns);
+			list_move_tail(&ns->list, &rm_list);
 	}
 	}
+	up_write(&ctrl->namespaces_rwsem);
+
+	list_for_each_entry_safe(ns, next, &rm_list, list)
+		nvme_ns_remove(ns);
+
 }
 }
 
 
 static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn)
 static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn)
@@ -3107,7 +3135,7 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn)
 	unsigned i, j, nsid, prev = 0, num_lists = DIV_ROUND_UP(nn, 1024);
 	unsigned i, j, nsid, prev = 0, num_lists = DIV_ROUND_UP(nn, 1024);
 	int ret = 0;
 	int ret = 0;
 
 
-	ns_list = kzalloc(0x1000, GFP_KERNEL);
+	ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
 	if (!ns_list)
 	if (!ns_list)
 		return -ENOMEM;
 		return -ENOMEM;
 
 
@@ -3173,9 +3201,9 @@ static void nvme_scan_work(struct work_struct *work)
 	}
 	}
 	nvme_scan_ns_sequential(ctrl, nn);
 	nvme_scan_ns_sequential(ctrl, nn);
  done:
  done:
-	mutex_lock(&ctrl->namespaces_mutex);
+	down_write(&ctrl->namespaces_rwsem);
 	list_sort(NULL, &ctrl->namespaces, ns_cmp);
 	list_sort(NULL, &ctrl->namespaces, ns_cmp);
-	mutex_unlock(&ctrl->namespaces_mutex);
+	up_write(&ctrl->namespaces_rwsem);
 	kfree(id);
 	kfree(id);
 }
 }
 
 
@@ -3197,6 +3225,7 @@ EXPORT_SYMBOL_GPL(nvme_queue_scan);
 void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
 void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
 {
 {
 	struct nvme_ns *ns, *next;
 	struct nvme_ns *ns, *next;
+	LIST_HEAD(ns_list);
 
 
 	/*
 	/*
 	 * The dead states indicates the controller was not gracefully
 	 * The dead states indicates the controller was not gracefully
@@ -3207,7 +3236,11 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
 	if (ctrl->state == NVME_CTRL_DEAD)
 	if (ctrl->state == NVME_CTRL_DEAD)
 		nvme_kill_queues(ctrl);
 		nvme_kill_queues(ctrl);
 
 
-	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list)
+	down_write(&ctrl->namespaces_rwsem);
+	list_splice_init(&ctrl->namespaces, &ns_list);
+	up_write(&ctrl->namespaces_rwsem);
+
+	list_for_each_entry_safe(ns, next, &ns_list, list)
 		nvme_ns_remove(ns);
 		nvme_ns_remove(ns);
 }
 }
 EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
 EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
@@ -3337,6 +3370,8 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
 	flush_work(&ctrl->async_event_work);
 	flush_work(&ctrl->async_event_work);
 	flush_work(&ctrl->scan_work);
 	flush_work(&ctrl->scan_work);
 	cancel_work_sync(&ctrl->fw_act_work);
 	cancel_work_sync(&ctrl->fw_act_work);
+	if (ctrl->ops->stop_ctrl)
+		ctrl->ops->stop_ctrl(ctrl);
 }
 }
 EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
 EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
 
 
@@ -3394,7 +3429,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	ctrl->state = NVME_CTRL_NEW;
 	ctrl->state = NVME_CTRL_NEW;
 	spin_lock_init(&ctrl->lock);
 	spin_lock_init(&ctrl->lock);
 	INIT_LIST_HEAD(&ctrl->namespaces);
 	INIT_LIST_HEAD(&ctrl->namespaces);
-	mutex_init(&ctrl->namespaces_mutex);
+	init_rwsem(&ctrl->namespaces_rwsem);
 	ctrl->dev = dev;
 	ctrl->dev = dev;
 	ctrl->ops = ops;
 	ctrl->ops = ops;
 	ctrl->quirks = quirks;
 	ctrl->quirks = quirks;
@@ -3455,7 +3490,7 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
 {
 {
 	struct nvme_ns *ns;
 	struct nvme_ns *ns;
 
 
-	mutex_lock(&ctrl->namespaces_mutex);
+	down_read(&ctrl->namespaces_rwsem);
 
 
 	/* Forcibly unquiesce queues to avoid blocking dispatch */
 	/* Forcibly unquiesce queues to avoid blocking dispatch */
 	if (ctrl->admin_q)
 	if (ctrl->admin_q)
@@ -3474,7 +3509,7 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
 		/* Forcibly unquiesce queues to avoid blocking dispatch */
 		/* Forcibly unquiesce queues to avoid blocking dispatch */
 		blk_mq_unquiesce_queue(ns->queue);
 		blk_mq_unquiesce_queue(ns->queue);
 	}
 	}
-	mutex_unlock(&ctrl->namespaces_mutex);
+	up_read(&ctrl->namespaces_rwsem);
 }
 }
 EXPORT_SYMBOL_GPL(nvme_kill_queues);
 EXPORT_SYMBOL_GPL(nvme_kill_queues);
 
 
@@ -3482,10 +3517,10 @@ void nvme_unfreeze(struct nvme_ctrl *ctrl)
 {
 {
 	struct nvme_ns *ns;
 	struct nvme_ns *ns;
 
 
-	mutex_lock(&ctrl->namespaces_mutex);
+	down_read(&ctrl->namespaces_rwsem);
 	list_for_each_entry(ns, &ctrl->namespaces, list)
 	list_for_each_entry(ns, &ctrl->namespaces, list)
 		blk_mq_unfreeze_queue(ns->queue);
 		blk_mq_unfreeze_queue(ns->queue);
-	mutex_unlock(&ctrl->namespaces_mutex);
+	up_read(&ctrl->namespaces_rwsem);
 }
 }
 EXPORT_SYMBOL_GPL(nvme_unfreeze);
 EXPORT_SYMBOL_GPL(nvme_unfreeze);
 
 
@@ -3493,13 +3528,13 @@ void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
 {
 {
 	struct nvme_ns *ns;
 	struct nvme_ns *ns;
 
 
-	mutex_lock(&ctrl->namespaces_mutex);
+	down_read(&ctrl->namespaces_rwsem);
 	list_for_each_entry(ns, &ctrl->namespaces, list) {
 	list_for_each_entry(ns, &ctrl->namespaces, list) {
 		timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
 		timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
 		if (timeout <= 0)
 		if (timeout <= 0)
 			break;
 			break;
 	}
 	}
-	mutex_unlock(&ctrl->namespaces_mutex);
+	up_read(&ctrl->namespaces_rwsem);
 }
 }
 EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
 EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
 
 
@@ -3507,10 +3542,10 @@ void nvme_wait_freeze(struct nvme_ctrl *ctrl)
 {
 {
 	struct nvme_ns *ns;
 	struct nvme_ns *ns;
 
 
-	mutex_lock(&ctrl->namespaces_mutex);
+	down_read(&ctrl->namespaces_rwsem);
 	list_for_each_entry(ns, &ctrl->namespaces, list)
 	list_for_each_entry(ns, &ctrl->namespaces, list)
 		blk_mq_freeze_queue_wait(ns->queue);
 		blk_mq_freeze_queue_wait(ns->queue);
-	mutex_unlock(&ctrl->namespaces_mutex);
+	up_read(&ctrl->namespaces_rwsem);
 }
 }
 EXPORT_SYMBOL_GPL(nvme_wait_freeze);
 EXPORT_SYMBOL_GPL(nvme_wait_freeze);
 
 
@@ -3518,10 +3553,10 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl)
 {
 {
 	struct nvme_ns *ns;
 	struct nvme_ns *ns;
 
 
-	mutex_lock(&ctrl->namespaces_mutex);
+	down_read(&ctrl->namespaces_rwsem);
 	list_for_each_entry(ns, &ctrl->namespaces, list)
 	list_for_each_entry(ns, &ctrl->namespaces, list)
 		blk_freeze_queue_start(ns->queue);
 		blk_freeze_queue_start(ns->queue);
-	mutex_unlock(&ctrl->namespaces_mutex);
+	up_read(&ctrl->namespaces_rwsem);
 }
 }
 EXPORT_SYMBOL_GPL(nvme_start_freeze);
 EXPORT_SYMBOL_GPL(nvme_start_freeze);
 
 
@@ -3529,10 +3564,10 @@ void nvme_stop_queues(struct nvme_ctrl *ctrl)
 {
 {
 	struct nvme_ns *ns;
 	struct nvme_ns *ns;
 
 
-	mutex_lock(&ctrl->namespaces_mutex);
+	down_read(&ctrl->namespaces_rwsem);
 	list_for_each_entry(ns, &ctrl->namespaces, list)
 	list_for_each_entry(ns, &ctrl->namespaces, list)
 		blk_mq_quiesce_queue(ns->queue);
 		blk_mq_quiesce_queue(ns->queue);
-	mutex_unlock(&ctrl->namespaces_mutex);
+	up_read(&ctrl->namespaces_rwsem);
 }
 }
 EXPORT_SYMBOL_GPL(nvme_stop_queues);
 EXPORT_SYMBOL_GPL(nvme_stop_queues);
 
 
@@ -3540,10 +3575,10 @@ void nvme_start_queues(struct nvme_ctrl *ctrl)
 {
 {
 	struct nvme_ns *ns;
 	struct nvme_ns *ns;
 
 
-	mutex_lock(&ctrl->namespaces_mutex);
+	down_read(&ctrl->namespaces_rwsem);
 	list_for_each_entry(ns, &ctrl->namespaces, list)
 	list_for_each_entry(ns, &ctrl->namespaces, list)
 		blk_mq_unquiesce_queue(ns->queue);
 		blk_mq_unquiesce_queue(ns->queue);
-	mutex_unlock(&ctrl->namespaces_mutex);
+	up_read(&ctrl->namespaces_rwsem);
 }
 }
 EXPORT_SYMBOL_GPL(nvme_start_queues);
 EXPORT_SYMBOL_GPL(nvme_start_queues);
 
 

+ 79 - 0
drivers/nvme/host/fault_inject.c

@@ -0,0 +1,79 @@
+/*
+ * fault injection support for nvme.
+ *
+ * Copyright (c) 2018, Oracle and/or its affiliates
+ *
+ */
+
+#include <linux/moduleparam.h>
+#include "nvme.h"
+
+static DECLARE_FAULT_ATTR(fail_default_attr);
+/* optional fault injection attributes boot time option:
+ * nvme_core.fail_request=<interval>,<probability>,<space>,<times>
+ */
+static char *fail_request;
+module_param(fail_request, charp, 0000);
+
+void nvme_fault_inject_init(struct nvme_ns *ns)
+{
+	struct dentry *dir, *parent;
+	char *name = ns->disk->disk_name;
+	struct nvme_fault_inject *fault_inj = &ns->fault_inject;
+	struct fault_attr *attr = &fault_inj->attr;
+
+	/* set default fault injection attribute */
+	if (fail_request)
+		setup_fault_attr(&fail_default_attr, fail_request);
+
+	/* create debugfs directory and attribute */
+	parent = debugfs_create_dir(name, NULL);
+	if (!parent) {
+		pr_warn("%s: failed to create debugfs directory\n", name);
+		return;
+	}
+
+	*attr = fail_default_attr;
+	dir = fault_create_debugfs_attr("fault_inject", parent, attr);
+	if (IS_ERR(dir)) {
+		pr_warn("%s: failed to create debugfs attr\n", name);
+		debugfs_remove_recursive(parent);
+		return;
+	}
+	ns->fault_inject.parent = parent;
+
+	/* create debugfs for status code and dont_retry */
+	fault_inj->status = NVME_SC_INVALID_OPCODE;
+	fault_inj->dont_retry = true;
+	debugfs_create_x16("status", 0600, dir,	&fault_inj->status);
+	debugfs_create_bool("dont_retry", 0600, dir, &fault_inj->dont_retry);
+}
+
+void nvme_fault_inject_fini(struct nvme_ns *ns)
+{
+	/* remove debugfs directories */
+	debugfs_remove_recursive(ns->fault_inject.parent);
+}
+
+void nvme_should_fail(struct request *req)
+{
+	struct gendisk *disk = req->rq_disk;
+	struct nvme_ns *ns = NULL;
+	u16 status;
+
+	/*
+	 * make sure this request is coming from a valid namespace
+	 */
+	if (!disk)
+		return;
+
+	ns = disk->private_data;
+	if (ns && should_fail(&ns->fault_inject.attr, 1)) {
+		/* inject status code and DNR bit */
+		status = ns->fault_inject.status;
+		if (ns->fault_inject.dont_retry)
+			status |= NVME_SC_DNR;
+		nvme_req(req)->status =	status;
+	}
+}
+EXPORT_SYMBOL_GPL(nvme_should_fail);

+ 16 - 20
drivers/nvme/host/fc.c

@@ -588,6 +588,8 @@ nvme_fc_attach_to_suspended_rport(struct nvme_fc_lport *lport,
 			return ERR_PTR(-ESTALE);
 			return ERR_PTR(-ESTALE);
 		}
 		}
 
 
+		rport->remoteport.port_role = pinfo->port_role;
+		rport->remoteport.port_id = pinfo->port_id;
 		rport->remoteport.port_state = FC_OBJSTATE_ONLINE;
 		rport->remoteport.port_state = FC_OBJSTATE_ONLINE;
 		rport->dev_loss_end = 0;
 		rport->dev_loss_end = 0;
 
 
@@ -768,8 +770,7 @@ nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl)
 		 */
 		 */
 		if (nvme_reset_ctrl(&ctrl->ctrl)) {
 		if (nvme_reset_ctrl(&ctrl->ctrl)) {
 			dev_warn(ctrl->ctrl.device,
 			dev_warn(ctrl->ctrl.device,
-				"NVME-FC{%d}: Couldn't schedule reset. "
-				"Deleting controller.\n",
+				"NVME-FC{%d}: Couldn't schedule reset.\n",
 				ctrl->cnum);
 				ctrl->cnum);
 			nvme_delete_ctrl(&ctrl->ctrl);
 			nvme_delete_ctrl(&ctrl->ctrl);
 		}
 		}
@@ -836,8 +837,7 @@ nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *portptr)
 		/* if dev_loss_tmo==0, dev loss is immediate */
 		/* if dev_loss_tmo==0, dev loss is immediate */
 		if (!portptr->dev_loss_tmo) {
 		if (!portptr->dev_loss_tmo) {
 			dev_warn(ctrl->ctrl.device,
 			dev_warn(ctrl->ctrl.device,
-				"NVME-FC{%d}: controller connectivity lost. "
-				"Deleting controller.\n",
+				"NVME-FC{%d}: controller connectivity lost.\n",
 				ctrl->cnum);
 				ctrl->cnum);
 			nvme_delete_ctrl(&ctrl->ctrl);
 			nvme_delete_ctrl(&ctrl->ctrl);
 		} else
 		} else
@@ -2076,20 +2076,10 @@ nvme_fc_timeout(struct request *rq, bool reserved)
 {
 {
 	struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
 	struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
 	struct nvme_fc_ctrl *ctrl = op->ctrl;
 	struct nvme_fc_ctrl *ctrl = op->ctrl;
-	int ret;
-
-	if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE ||
-			atomic_read(&op->state) == FCPOP_STATE_ABORTED)
-		return BLK_EH_RESET_TIMER;
-
-	ret = __nvme_fc_abort_op(ctrl, op);
-	if (ret)
-		/* io wasn't active to abort */
-		return BLK_EH_NOT_HANDLED;
 
 
 	/*
 	/*
 	 * we can't individually ABTS an io without affecting the queue,
 	 * we can't individually ABTS an io without affecting the queue,
-	 * thus killing the queue, adn thus the association.
+	 * thus killing the queue, and thus the association.
 	 * So resolve by performing a controller reset, which will stop
 	 * So resolve by performing a controller reset, which will stop
 	 * the host/io stack, terminate the association on the link,
 	 * the host/io stack, terminate the association on the link,
 	 * and recreate an association on the link.
 	 * and recreate an association on the link.
@@ -2191,7 +2181,7 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
 	struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu;
 	struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu;
 	struct nvme_command *sqe = &cmdiu->sqe;
 	struct nvme_command *sqe = &cmdiu->sqe;
 	u32 csn;
 	u32 csn;
-	int ret;
+	int ret, opstate;
 
 
 	/*
 	/*
 	 * before attempting to send the io, check to see if we believe
 	 * before attempting to send the io, check to see if we believe
@@ -2269,6 +2259,9 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
 					queue->lldd_handle, &op->fcp_req);
 					queue->lldd_handle, &op->fcp_req);
 
 
 	if (ret) {
 	if (ret) {
+		opstate = atomic_xchg(&op->state, FCPOP_STATE_COMPLETE);
+		__nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
+
 		if (!(op->flags & FCOP_FLAGS_AEN))
 		if (!(op->flags & FCOP_FLAGS_AEN))
 			nvme_fc_unmap_data(ctrl, op->rq, op);
 			nvme_fc_unmap_data(ctrl, op->rq, op);
 
 
@@ -2889,14 +2882,13 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status)
 		if (portptr->port_state == FC_OBJSTATE_ONLINE)
 		if (portptr->port_state == FC_OBJSTATE_ONLINE)
 			dev_warn(ctrl->ctrl.device,
 			dev_warn(ctrl->ctrl.device,
 				"NVME-FC{%d}: Max reconnect attempts (%d) "
 				"NVME-FC{%d}: Max reconnect attempts (%d) "
-				"reached. Removing controller\n",
+				"reached.\n",
 				ctrl->cnum, ctrl->ctrl.nr_reconnects);
 				ctrl->cnum, ctrl->ctrl.nr_reconnects);
 		else
 		else
 			dev_warn(ctrl->ctrl.device,
 			dev_warn(ctrl->ctrl.device,
 				"NVME-FC{%d}: dev_loss_tmo (%d) expired "
 				"NVME-FC{%d}: dev_loss_tmo (%d) expired "
-				"while waiting for remoteport connectivity. "
-				"Removing controller\n", ctrl->cnum,
-				portptr->dev_loss_tmo);
+				"while waiting for remoteport connectivity.\n",
+				ctrl->cnum, portptr->dev_loss_tmo);
 		WARN_ON(nvme_delete_ctrl(&ctrl->ctrl));
 		WARN_ON(nvme_delete_ctrl(&ctrl->ctrl));
 	}
 	}
 }
 }
@@ -3133,6 +3125,10 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 	}
 	}
 
 
 	if (ret) {
 	if (ret) {
+		nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING);
+		cancel_work_sync(&ctrl->ctrl.reset_work);
+		cancel_delayed_work_sync(&ctrl->connect_work);
+
 		/* couldn't schedule retry - fail out */
 		/* couldn't schedule retry - fail out */
 		dev_err(ctrl->ctrl.device,
 		dev_err(ctrl->ctrl.device,
 			"NVME-FC{%d}: Connect retry failed\n", ctrl->cnum);
 			"NVME-FC{%d}: Connect retry failed\n", ctrl->cnum);

+ 579 - 178
drivers/nvme/host/lightnvm.c

@@ -35,6 +35,10 @@ enum nvme_nvm_admin_opcode {
 	nvme_nvm_admin_set_bb_tbl	= 0xf1,
 	nvme_nvm_admin_set_bb_tbl	= 0xf1,
 };
 };
 
 
+enum nvme_nvm_log_page {
+	NVME_NVM_LOG_REPORT_CHUNK	= 0xca,
+};
+
 struct nvme_nvm_ph_rw {
 struct nvme_nvm_ph_rw {
 	__u8			opcode;
 	__u8			opcode;
 	__u8			flags;
 	__u8			flags;
@@ -51,6 +55,21 @@ struct nvme_nvm_ph_rw {
 	__le64			resv;
 	__le64			resv;
 };
 };
 
 
+struct nvme_nvm_erase_blk {
+	__u8			opcode;
+	__u8			flags;
+	__u16			command_id;
+	__le32			nsid;
+	__u64			rsvd[2];
+	__le64			prp1;
+	__le64			prp2;
+	__le64			spba;
+	__le16			length;
+	__le16			control;
+	__le32			dsmgmt;
+	__le64			resv;
+};
+
 struct nvme_nvm_identity {
 struct nvme_nvm_identity {
 	__u8			opcode;
 	__u8			opcode;
 	__u8			flags;
 	__u8			flags;
@@ -59,8 +78,7 @@ struct nvme_nvm_identity {
 	__u64			rsvd[2];
 	__u64			rsvd[2];
 	__le64			prp1;
 	__le64			prp1;
 	__le64			prp2;
 	__le64			prp2;
-	__le32			chnl_off;
-	__u32			rsvd11[5];
+	__u32			rsvd11[6];
 };
 };
 
 
 struct nvme_nvm_getbbtbl {
 struct nvme_nvm_getbbtbl {
@@ -90,44 +108,18 @@ struct nvme_nvm_setbbtbl {
 	__u32			rsvd4[3];
 	__u32			rsvd4[3];
 };
 };
 
 
-struct nvme_nvm_erase_blk {
-	__u8			opcode;
-	__u8			flags;
-	__u16			command_id;
-	__le32			nsid;
-	__u64			rsvd[2];
-	__le64			prp1;
-	__le64			prp2;
-	__le64			spba;
-	__le16			length;
-	__le16			control;
-	__le32			dsmgmt;
-	__le64			resv;
-};
-
 struct nvme_nvm_command {
 struct nvme_nvm_command {
 	union {
 	union {
 		struct nvme_common_command common;
 		struct nvme_common_command common;
-		struct nvme_nvm_identity identity;
 		struct nvme_nvm_ph_rw ph_rw;
 		struct nvme_nvm_ph_rw ph_rw;
+		struct nvme_nvm_erase_blk erase;
+		struct nvme_nvm_identity identity;
 		struct nvme_nvm_getbbtbl get_bb;
 		struct nvme_nvm_getbbtbl get_bb;
 		struct nvme_nvm_setbbtbl set_bb;
 		struct nvme_nvm_setbbtbl set_bb;
-		struct nvme_nvm_erase_blk erase;
 	};
 	};
 };
 };
 
 
-#define NVME_NVM_LP_MLC_PAIRS 886
-struct nvme_nvm_lp_mlc {
-	__le16			num_pairs;
-	__u8			pairs[NVME_NVM_LP_MLC_PAIRS];
-};
-
-struct nvme_nvm_lp_tbl {
-	__u8			id[8];
-	struct nvme_nvm_lp_mlc	mlc;
-};
-
-struct nvme_nvm_id_group {
+struct nvme_nvm_id12_grp {
 	__u8			mtype;
 	__u8			mtype;
 	__u8			fmtype;
 	__u8			fmtype;
 	__le16			res16;
 	__le16			res16;
@@ -150,11 +142,10 @@ struct nvme_nvm_id_group {
 	__le32			mpos;
 	__le32			mpos;
 	__le32			mccap;
 	__le32			mccap;
 	__le16			cpar;
 	__le16			cpar;
-	__u8			reserved[10];
-	struct nvme_nvm_lp_tbl lptbl;
+	__u8			reserved[906];
 } __packed;
 } __packed;
 
 
-struct nvme_nvm_addr_format {
+struct nvme_nvm_id12_addrf {
 	__u8			ch_offset;
 	__u8			ch_offset;
 	__u8			ch_len;
 	__u8			ch_len;
 	__u8			lun_offset;
 	__u8			lun_offset;
@@ -165,21 +156,22 @@ struct nvme_nvm_addr_format {
 	__u8			blk_len;
 	__u8			blk_len;
 	__u8			pg_offset;
 	__u8			pg_offset;
 	__u8			pg_len;
 	__u8			pg_len;
-	__u8			sect_offset;
-	__u8			sect_len;
+	__u8			sec_offset;
+	__u8			sec_len;
 	__u8			res[4];
 	__u8			res[4];
 } __packed;
 } __packed;
 
 
-struct nvme_nvm_id {
+struct nvme_nvm_id12 {
 	__u8			ver_id;
 	__u8			ver_id;
 	__u8			vmnt;
 	__u8			vmnt;
 	__u8			cgrps;
 	__u8			cgrps;
 	__u8			res;
 	__u8			res;
 	__le32			cap;
 	__le32			cap;
 	__le32			dom;
 	__le32			dom;
-	struct nvme_nvm_addr_format ppaf;
+	struct nvme_nvm_id12_addrf ppaf;
 	__u8			resv[228];
 	__u8			resv[228];
-	struct nvme_nvm_id_group groups[4];
+	struct nvme_nvm_id12_grp grp;
+	__u8			resv2[2880];
 } __packed;
 } __packed;
 
 
 struct nvme_nvm_bb_tbl {
 struct nvme_nvm_bb_tbl {
@@ -196,6 +188,68 @@ struct nvme_nvm_bb_tbl {
 	__u8	blk[0];
 	__u8	blk[0];
 };
 };
 
 
+struct nvme_nvm_id20_addrf {
+	__u8			grp_len;
+	__u8			pu_len;
+	__u8			chk_len;
+	__u8			lba_len;
+	__u8			resv[4];
+};
+
+struct nvme_nvm_id20 {
+	__u8			mjr;
+	__u8			mnr;
+	__u8			resv[6];
+
+	struct nvme_nvm_id20_addrf lbaf;
+
+	__le32			mccap;
+	__u8			resv2[12];
+
+	__u8			wit;
+	__u8			resv3[31];
+
+	/* Geometry */
+	__le16			num_grp;
+	__le16			num_pu;
+	__le32			num_chk;
+	__le32			clba;
+	__u8			resv4[52];
+
+	/* Write data requirements */
+	__le32			ws_min;
+	__le32			ws_opt;
+	__le32			mw_cunits;
+	__le32			maxoc;
+	__le32			maxocpu;
+	__u8			resv5[44];
+
+	/* Performance related metrics */
+	__le32			trdt;
+	__le32			trdm;
+	__le32			twrt;
+	__le32			twrm;
+	__le32			tcrst;
+	__le32			tcrsm;
+	__u8			resv6[40];
+
+	/* Reserved area */
+	__u8			resv7[2816];
+
+	/* Vendor specific */
+	__u8			vs[1024];
+};
+
+struct nvme_nvm_chk_meta {
+	__u8	state;
+	__u8	type;
+	__u8	wi;
+	__u8	rsvd[5];
+	__le64	slba;
+	__le64	cnlb;
+	__le64	wp;
+};
+
 /*
 /*
  * Check we didn't inadvertently grow the command struct
  * Check we didn't inadvertently grow the command struct
  */
  */
@@ -203,105 +257,238 @@ static inline void _nvme_nvm_check_size(void)
 {
 {
 	BUILD_BUG_ON(sizeof(struct nvme_nvm_identity) != 64);
 	BUILD_BUG_ON(sizeof(struct nvme_nvm_identity) != 64);
 	BUILD_BUG_ON(sizeof(struct nvme_nvm_ph_rw) != 64);
 	BUILD_BUG_ON(sizeof(struct nvme_nvm_ph_rw) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_nvm_erase_blk) != 64);
 	BUILD_BUG_ON(sizeof(struct nvme_nvm_getbbtbl) != 64);
 	BUILD_BUG_ON(sizeof(struct nvme_nvm_getbbtbl) != 64);
 	BUILD_BUG_ON(sizeof(struct nvme_nvm_setbbtbl) != 64);
 	BUILD_BUG_ON(sizeof(struct nvme_nvm_setbbtbl) != 64);
-	BUILD_BUG_ON(sizeof(struct nvme_nvm_erase_blk) != 64);
-	BUILD_BUG_ON(sizeof(struct nvme_nvm_id_group) != 960);
-	BUILD_BUG_ON(sizeof(struct nvme_nvm_addr_format) != 16);
-	BUILD_BUG_ON(sizeof(struct nvme_nvm_id) != NVME_IDENTIFY_DATA_SIZE);
+	BUILD_BUG_ON(sizeof(struct nvme_nvm_id12_grp) != 960);
+	BUILD_BUG_ON(sizeof(struct nvme_nvm_id12_addrf) != 16);
+	BUILD_BUG_ON(sizeof(struct nvme_nvm_id12) != NVME_IDENTIFY_DATA_SIZE);
 	BUILD_BUG_ON(sizeof(struct nvme_nvm_bb_tbl) != 64);
 	BUILD_BUG_ON(sizeof(struct nvme_nvm_bb_tbl) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_nvm_id20_addrf) != 8);
+	BUILD_BUG_ON(sizeof(struct nvme_nvm_id20) != NVME_IDENTIFY_DATA_SIZE);
+	BUILD_BUG_ON(sizeof(struct nvme_nvm_chk_meta) != 32);
+	BUILD_BUG_ON(sizeof(struct nvme_nvm_chk_meta) !=
+						sizeof(struct nvm_chk_meta));
+}
+
+static void nvme_nvm_set_addr_12(struct nvm_addrf_12 *dst,
+				 struct nvme_nvm_id12_addrf *src)
+{
+	dst->ch_len = src->ch_len;
+	dst->lun_len = src->lun_len;
+	dst->blk_len = src->blk_len;
+	dst->pg_len = src->pg_len;
+	dst->pln_len = src->pln_len;
+	dst->sec_len = src->sec_len;
+
+	dst->ch_offset = src->ch_offset;
+	dst->lun_offset = src->lun_offset;
+	dst->blk_offset = src->blk_offset;
+	dst->pg_offset = src->pg_offset;
+	dst->pln_offset = src->pln_offset;
+	dst->sec_offset = src->sec_offset;
+
+	dst->ch_mask = ((1ULL << dst->ch_len) - 1) << dst->ch_offset;
+	dst->lun_mask = ((1ULL << dst->lun_len) - 1) << dst->lun_offset;
+	dst->blk_mask = ((1ULL << dst->blk_len) - 1) << dst->blk_offset;
+	dst->pg_mask = ((1ULL << dst->pg_len) - 1) << dst->pg_offset;
+	dst->pln_mask = ((1ULL << dst->pln_len) - 1) << dst->pln_offset;
+	dst->sec_mask = ((1ULL << dst->sec_len) - 1) << dst->sec_offset;
 }
 }
 
 
-static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id)
+static int nvme_nvm_setup_12(struct nvme_nvm_id12 *id,
+			     struct nvm_geo *geo)
 {
 {
-	struct nvme_nvm_id_group *src;
-	struct nvm_id_group *grp;
+	struct nvme_nvm_id12_grp *src;
 	int sec_per_pg, sec_per_pl, pg_per_blk;
 	int sec_per_pg, sec_per_pl, pg_per_blk;
 
 
-	if (nvme_nvm_id->cgrps != 1)
+	if (id->cgrps != 1)
+		return -EINVAL;
+
+	src = &id->grp;
+
+	if (src->mtype != 0) {
+		pr_err("nvm: memory type not supported\n");
 		return -EINVAL;
 		return -EINVAL;
+	}
+
+	/* 1.2 spec. only reports a single version id - unfold */
+	geo->major_ver_id = id->ver_id;
+	geo->minor_ver_id = 2;
 
 
-	src = &nvme_nvm_id->groups[0];
-	grp = &nvm_id->grp;
+	/* Set compacted version for upper layers */
+	geo->version = NVM_OCSSD_SPEC_12;
 
 
-	grp->mtype = src->mtype;
-	grp->fmtype = src->fmtype;
+	geo->num_ch = src->num_ch;
+	geo->num_lun = src->num_lun;
+	geo->all_luns = geo->num_ch * geo->num_lun;
 
 
-	grp->num_ch = src->num_ch;
-	grp->num_lun = src->num_lun;
+	geo->num_chk = le16_to_cpu(src->num_chk);
 
 
-	grp->num_chk = le16_to_cpu(src->num_chk);
-	grp->csecs = le16_to_cpu(src->csecs);
-	grp->sos = le16_to_cpu(src->sos);
+	geo->csecs = le16_to_cpu(src->csecs);
+	geo->sos = le16_to_cpu(src->sos);
 
 
 	pg_per_blk = le16_to_cpu(src->num_pg);
 	pg_per_blk = le16_to_cpu(src->num_pg);
-	sec_per_pg = le16_to_cpu(src->fpg_sz) / grp->csecs;
+	sec_per_pg = le16_to_cpu(src->fpg_sz) / geo->csecs;
 	sec_per_pl = sec_per_pg * src->num_pln;
 	sec_per_pl = sec_per_pg * src->num_pln;
-	grp->clba = sec_per_pl * pg_per_blk;
-	grp->ws_per_chk = pg_per_blk;
-
-	grp->mpos = le32_to_cpu(src->mpos);
-	grp->cpar = le16_to_cpu(src->cpar);
-	grp->mccap = le32_to_cpu(src->mccap);
-
-	grp->ws_opt = grp->ws_min = sec_per_pg;
-	grp->ws_seq = NVM_IO_SNGL_ACCESS;
-
-	if (grp->mpos & 0x020202) {
-		grp->ws_seq = NVM_IO_DUAL_ACCESS;
-		grp->ws_opt <<= 1;
-	} else if (grp->mpos & 0x040404) {
-		grp->ws_seq = NVM_IO_QUAD_ACCESS;
-		grp->ws_opt <<= 2;
-	}
+	geo->clba = sec_per_pl * pg_per_blk;
+
+	geo->all_chunks = geo->all_luns * geo->num_chk;
+	geo->total_secs = geo->clba * geo->all_chunks;
+
+	geo->ws_min = sec_per_pg;
+	geo->ws_opt = sec_per_pg;
+	geo->mw_cunits = geo->ws_opt << 3;	/* default to MLC safe values */
 
 
-	grp->trdt = le32_to_cpu(src->trdt);
-	grp->trdm = le32_to_cpu(src->trdm);
-	grp->tprt = le32_to_cpu(src->tprt);
-	grp->tprm = le32_to_cpu(src->tprm);
-	grp->tbet = le32_to_cpu(src->tbet);
-	grp->tbem = le32_to_cpu(src->tbem);
+	/* Do not impose values for maximum number of open blocks as it is
+	 * unspecified in 1.2. Users of 1.2 must be aware of this and eventually
+	 * specify these values through a quirk if restrictions apply.
+	 */
+	geo->maxoc = geo->all_luns * geo->num_chk;
+	geo->maxocpu = geo->num_chk;
+
+	geo->mccap = le32_to_cpu(src->mccap);
+
+	geo->trdt = le32_to_cpu(src->trdt);
+	geo->trdm = le32_to_cpu(src->trdm);
+	geo->tprt = le32_to_cpu(src->tprt);
+	geo->tprm = le32_to_cpu(src->tprm);
+	geo->tbet = le32_to_cpu(src->tbet);
+	geo->tbem = le32_to_cpu(src->tbem);
 
 
 	/* 1.2 compatibility */
 	/* 1.2 compatibility */
-	grp->num_pln = src->num_pln;
-	grp->num_pg = le16_to_cpu(src->num_pg);
-	grp->fpg_sz = le16_to_cpu(src->fpg_sz);
+	geo->vmnt = id->vmnt;
+	geo->cap = le32_to_cpu(id->cap);
+	geo->dom = le32_to_cpu(id->dom);
+
+	geo->mtype = src->mtype;
+	geo->fmtype = src->fmtype;
+
+	geo->cpar = le16_to_cpu(src->cpar);
+	geo->mpos = le32_to_cpu(src->mpos);
+
+	geo->pln_mode = NVM_PLANE_SINGLE;
+
+	if (geo->mpos & 0x020202) {
+		geo->pln_mode = NVM_PLANE_DOUBLE;
+		geo->ws_opt <<= 1;
+	} else if (geo->mpos & 0x040404) {
+		geo->pln_mode = NVM_PLANE_QUAD;
+		geo->ws_opt <<= 2;
+	}
+
+	geo->num_pln = src->num_pln;
+	geo->num_pg = le16_to_cpu(src->num_pg);
+	geo->fpg_sz = le16_to_cpu(src->fpg_sz);
+
+	nvme_nvm_set_addr_12((struct nvm_addrf_12 *)&geo->addrf, &id->ppaf);
 
 
 	return 0;
 	return 0;
 }
 }
 
 
-static int nvme_nvm_identity(struct nvm_dev *nvmdev, struct nvm_id *nvm_id)
+static void nvme_nvm_set_addr_20(struct nvm_addrf *dst,
+				 struct nvme_nvm_id20_addrf *src)
+{
+	dst->ch_len = src->grp_len;
+	dst->lun_len = src->pu_len;
+	dst->chk_len = src->chk_len;
+	dst->sec_len = src->lba_len;
+
+	dst->sec_offset = 0;
+	dst->chk_offset = dst->sec_len;
+	dst->lun_offset = dst->chk_offset + dst->chk_len;
+	dst->ch_offset = dst->lun_offset + dst->lun_len;
+
+	dst->ch_mask = ((1ULL << dst->ch_len) - 1) << dst->ch_offset;
+	dst->lun_mask = ((1ULL << dst->lun_len) - 1) << dst->lun_offset;
+	dst->chk_mask = ((1ULL << dst->chk_len) - 1) << dst->chk_offset;
+	dst->sec_mask = ((1ULL << dst->sec_len) - 1) << dst->sec_offset;
+}
+
+static int nvme_nvm_setup_20(struct nvme_nvm_id20 *id,
+			     struct nvm_geo *geo)
+{
+	geo->major_ver_id = id->mjr;
+	geo->minor_ver_id = id->mnr;
+
+	/* Set compacted version for upper layers */
+	geo->version = NVM_OCSSD_SPEC_20;
+
+	if (!(geo->major_ver_id == 2 && geo->minor_ver_id == 0)) {
+		pr_err("nvm: OCSSD version not supported (v%d.%d)\n",
+				geo->major_ver_id, geo->minor_ver_id);
+		return -EINVAL;
+	}
+
+	geo->num_ch = le16_to_cpu(id->num_grp);
+	geo->num_lun = le16_to_cpu(id->num_pu);
+	geo->all_luns = geo->num_ch * geo->num_lun;
+
+	geo->num_chk = le32_to_cpu(id->num_chk);
+	geo->clba = le32_to_cpu(id->clba);
+
+	geo->all_chunks = geo->all_luns * geo->num_chk;
+	geo->total_secs = geo->clba * geo->all_chunks;
+
+	geo->ws_min = le32_to_cpu(id->ws_min);
+	geo->ws_opt = le32_to_cpu(id->ws_opt);
+	geo->mw_cunits = le32_to_cpu(id->mw_cunits);
+	geo->maxoc = le32_to_cpu(id->maxoc);
+	geo->maxocpu = le32_to_cpu(id->maxocpu);
+
+	geo->trdt = le32_to_cpu(id->trdt);
+	geo->trdm = le32_to_cpu(id->trdm);
+	geo->tprt = le32_to_cpu(id->twrt);
+	geo->tprm = le32_to_cpu(id->twrm);
+	geo->tbet = le32_to_cpu(id->tcrst);
+	geo->tbem = le32_to_cpu(id->tcrsm);
+
+	nvme_nvm_set_addr_20(&geo->addrf, &id->lbaf);
+
+	return 0;
+}
+
+static int nvme_nvm_identity(struct nvm_dev *nvmdev)
 {
 {
 	struct nvme_ns *ns = nvmdev->q->queuedata;
 	struct nvme_ns *ns = nvmdev->q->queuedata;
-	struct nvme_nvm_id *nvme_nvm_id;
+	struct nvme_nvm_id12 *id;
 	struct nvme_nvm_command c = {};
 	struct nvme_nvm_command c = {};
 	int ret;
 	int ret;
 
 
 	c.identity.opcode = nvme_nvm_admin_identity;
 	c.identity.opcode = nvme_nvm_admin_identity;
 	c.identity.nsid = cpu_to_le32(ns->head->ns_id);
 	c.identity.nsid = cpu_to_le32(ns->head->ns_id);
-	c.identity.chnl_off = 0;
 
 
-	nvme_nvm_id = kmalloc(sizeof(struct nvme_nvm_id), GFP_KERNEL);
-	if (!nvme_nvm_id)
+	id = kmalloc(sizeof(struct nvme_nvm_id12), GFP_KERNEL);
+	if (!id)
 		return -ENOMEM;
 		return -ENOMEM;
 
 
 	ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c,
 	ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c,
-				nvme_nvm_id, sizeof(struct nvme_nvm_id));
+				id, sizeof(struct nvme_nvm_id12));
 	if (ret) {
 	if (ret) {
 		ret = -EIO;
 		ret = -EIO;
 		goto out;
 		goto out;
 	}
 	}
 
 
-	nvm_id->ver_id = nvme_nvm_id->ver_id;
-	nvm_id->vmnt = nvme_nvm_id->vmnt;
-	nvm_id->cap = le32_to_cpu(nvme_nvm_id->cap);
-	nvm_id->dom = le32_to_cpu(nvme_nvm_id->dom);
-	memcpy(&nvm_id->ppaf, &nvme_nvm_id->ppaf,
-					sizeof(struct nvm_addr_format));
+	/*
+	 * The 1.2 and 2.0 specifications share the first byte in their geometry
+	 * command to make it possible to know what version a device implements.
+	 */
+	switch (id->ver_id) {
+	case 1:
+		ret = nvme_nvm_setup_12(id, &nvmdev->geo);
+		break;
+	case 2:
+		ret = nvme_nvm_setup_20((struct nvme_nvm_id20 *)id,
+							&nvmdev->geo);
+		break;
+	default:
+		dev_err(ns->ctrl->device, "OCSSD revision not supported (%d)\n",
+							id->ver_id);
+		ret = -EINVAL;
+	}
 
 
-	ret = init_grps(nvm_id, nvme_nvm_id);
 out:
 out:
-	kfree(nvme_nvm_id);
+	kfree(id);
 	return ret;
 	return ret;
 }
 }
 
 
@@ -314,7 +501,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa,
 	struct nvme_ctrl *ctrl = ns->ctrl;
 	struct nvme_ctrl *ctrl = ns->ctrl;
 	struct nvme_nvm_command c = {};
 	struct nvme_nvm_command c = {};
 	struct nvme_nvm_bb_tbl *bb_tbl;
 	struct nvme_nvm_bb_tbl *bb_tbl;
-	int nr_blks = geo->nr_chks * geo->plane_mode;
+	int nr_blks = geo->num_chk * geo->num_pln;
 	int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blks;
 	int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blks;
 	int ret = 0;
 	int ret = 0;
 
 
@@ -355,7 +542,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa,
 		goto out;
 		goto out;
 	}
 	}
 
 
-	memcpy(blks, bb_tbl->blk, geo->nr_chks * geo->plane_mode);
+	memcpy(blks, bb_tbl->blk, geo->num_chk * geo->num_pln);
 out:
 out:
 	kfree(bb_tbl);
 	kfree(bb_tbl);
 	return ret;
 	return ret;
@@ -382,6 +569,61 @@ static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr *ppas,
 	return ret;
 	return ret;
 }
 }
 
 
+/*
+ * Expect the lba in device format
+ */
+static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev,
+				 struct nvm_chk_meta *meta,
+				 sector_t slba, int nchks)
+{
+	struct nvm_geo *geo = &ndev->geo;
+	struct nvme_ns *ns = ndev->q->queuedata;
+	struct nvme_ctrl *ctrl = ns->ctrl;
+	struct nvme_nvm_chk_meta *dev_meta = (struct nvme_nvm_chk_meta *)meta;
+	struct ppa_addr ppa;
+	size_t left = nchks * sizeof(struct nvme_nvm_chk_meta);
+	size_t log_pos, offset, len;
+	int ret, i;
+
+	/* Normalize lba address space to obtain log offset */
+	ppa.ppa = slba;
+	ppa = dev_to_generic_addr(ndev, ppa);
+
+	log_pos = ppa.m.chk;
+	log_pos += ppa.m.pu * geo->num_chk;
+	log_pos += ppa.m.grp * geo->num_lun * geo->num_chk;
+
+	offset = log_pos * sizeof(struct nvme_nvm_chk_meta);
+
+	while (left) {
+		len = min_t(unsigned int, left, ctrl->max_hw_sectors << 9);
+
+		ret = nvme_get_log_ext(ctrl, ns, NVME_NVM_LOG_REPORT_CHUNK,
+				dev_meta, len, offset);
+		if (ret) {
+			dev_err(ctrl->device, "Get REPORT CHUNK log error\n");
+			break;
+		}
+
+		for (i = 0; i < len; i += sizeof(struct nvme_nvm_chk_meta)) {
+			meta->state = dev_meta->state;
+			meta->type = dev_meta->type;
+			meta->wi = dev_meta->wi;
+			meta->slba = le64_to_cpu(dev_meta->slba);
+			meta->cnlb = le64_to_cpu(dev_meta->cnlb);
+			meta->wp = le64_to_cpu(dev_meta->wp);
+
+			meta++;
+			dev_meta++;
+		}
+
+		offset += len;
+		left -= len;
+	}
+
+	return ret;
+}
+
 static inline void nvme_nvm_rqtocmd(struct nvm_rq *rqd, struct nvme_ns *ns,
 static inline void nvme_nvm_rqtocmd(struct nvm_rq *rqd, struct nvme_ns *ns,
 				    struct nvme_nvm_command *c)
 				    struct nvme_nvm_command *c)
 {
 {
@@ -513,6 +755,8 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = {
 	.get_bb_tbl		= nvme_nvm_get_bb_tbl,
 	.get_bb_tbl		= nvme_nvm_get_bb_tbl,
 	.set_bb_tbl		= nvme_nvm_set_bb_tbl,
 	.set_bb_tbl		= nvme_nvm_set_bb_tbl,
 
 
+	.get_chk_meta		= nvme_nvm_get_chk_meta,
+
 	.submit_io		= nvme_nvm_submit_io,
 	.submit_io		= nvme_nvm_submit_io,
 	.submit_io_sync		= nvme_nvm_submit_io_sync,
 	.submit_io_sync		= nvme_nvm_submit_io_sync,
 
 
@@ -520,8 +764,6 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = {
 	.destroy_dma_pool	= nvme_nvm_destroy_dma_pool,
 	.destroy_dma_pool	= nvme_nvm_destroy_dma_pool,
 	.dev_dma_alloc		= nvme_nvm_dev_dma_alloc,
 	.dev_dma_alloc		= nvme_nvm_dev_dma_alloc,
 	.dev_dma_free		= nvme_nvm_dev_dma_free,
 	.dev_dma_free		= nvme_nvm_dev_dma_free,
-
-	.max_phys_sect		= 64,
 };
 };
 
 
 static int nvme_nvm_submit_user_cmd(struct request_queue *q,
 static int nvme_nvm_submit_user_cmd(struct request_queue *q,
@@ -722,6 +964,15 @@ int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg)
 	}
 	}
 }
 }
 
 
+void nvme_nvm_update_nvm_info(struct nvme_ns *ns)
+{
+	struct nvm_dev *ndev = ns->ndev;
+	struct nvm_geo *geo = &ndev->geo;
+
+	geo->csecs = 1 << ns->lba_shift;
+	geo->sos = ns->ms;
+}
+
 int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
 int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
 {
 {
 	struct request_queue *q = ns->queue;
 	struct request_queue *q = ns->queue;
@@ -748,125 +999,205 @@ void nvme_nvm_unregister(struct nvme_ns *ns)
 }
 }
 
 
 static ssize_t nvm_dev_attr_show(struct device *dev,
 static ssize_t nvm_dev_attr_show(struct device *dev,
-				 struct device_attribute *dattr, char *page)
+		struct device_attribute *dattr, char *page)
 {
 {
 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
 	struct nvm_dev *ndev = ns->ndev;
 	struct nvm_dev *ndev = ns->ndev;
-	struct nvm_id *id;
-	struct nvm_id_group *grp;
+	struct nvm_geo *geo = &ndev->geo;
 	struct attribute *attr;
 	struct attribute *attr;
 
 
 	if (!ndev)
 	if (!ndev)
 		return 0;
 		return 0;
 
 
-	id = &ndev->identity;
-	grp = &id->grp;
 	attr = &dattr->attr;
 	attr = &dattr->attr;
 
 
 	if (strcmp(attr->name, "version") == 0) {
 	if (strcmp(attr->name, "version") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", id->ver_id);
-	} else if (strcmp(attr->name, "vendor_opcode") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", id->vmnt);
+		if (geo->major_ver_id == 1)
+			return scnprintf(page, PAGE_SIZE, "%u\n",
+						geo->major_ver_id);
+		else
+			return scnprintf(page, PAGE_SIZE, "%u.%u\n",
+						geo->major_ver_id,
+						geo->minor_ver_id);
 	} else if (strcmp(attr->name, "capabilities") == 0) {
 	} else if (strcmp(attr->name, "capabilities") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", id->cap);
+		return scnprintf(page, PAGE_SIZE, "%u\n", geo->cap);
+	} else if (strcmp(attr->name, "read_typ") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", geo->trdt);
+	} else if (strcmp(attr->name, "read_max") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", geo->trdm);
+	} else {
+		return scnprintf(page,
+				 PAGE_SIZE,
+				 "Unhandled attr(%s) in `%s`\n",
+				 attr->name, __func__);
+	}
+}
+
+static ssize_t nvm_dev_attr_show_ppaf(struct nvm_addrf_12 *ppaf, char *page)
+{
+	return scnprintf(page, PAGE_SIZE,
+		"0x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n",
+				ppaf->ch_offset, ppaf->ch_len,
+				ppaf->lun_offset, ppaf->lun_len,
+				ppaf->pln_offset, ppaf->pln_len,
+				ppaf->blk_offset, ppaf->blk_len,
+				ppaf->pg_offset, ppaf->pg_len,
+				ppaf->sec_offset, ppaf->sec_len);
+}
+
+static ssize_t nvm_dev_attr_show_12(struct device *dev,
+		struct device_attribute *dattr, char *page)
+{
+	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
+	struct nvm_dev *ndev = ns->ndev;
+	struct nvm_geo *geo = &ndev->geo;
+	struct attribute *attr;
+
+	if (!ndev)
+		return 0;
+
+	attr = &dattr->attr;
+
+	if (strcmp(attr->name, "vendor_opcode") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", geo->vmnt);
 	} else if (strcmp(attr->name, "device_mode") == 0) {
 	} else if (strcmp(attr->name, "device_mode") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", id->dom);
+		return scnprintf(page, PAGE_SIZE, "%u\n", geo->dom);
 	/* kept for compatibility */
 	/* kept for compatibility */
 	} else if (strcmp(attr->name, "media_manager") == 0) {
 	} else if (strcmp(attr->name, "media_manager") == 0) {
 		return scnprintf(page, PAGE_SIZE, "%s\n", "gennvm");
 		return scnprintf(page, PAGE_SIZE, "%s\n", "gennvm");
 	} else if (strcmp(attr->name, "ppa_format") == 0) {
 	} else if (strcmp(attr->name, "ppa_format") == 0) {
-		return scnprintf(page, PAGE_SIZE,
-			"0x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n",
-			id->ppaf.ch_offset, id->ppaf.ch_len,
-			id->ppaf.lun_offset, id->ppaf.lun_len,
-			id->ppaf.pln_offset, id->ppaf.pln_len,
-			id->ppaf.blk_offset, id->ppaf.blk_len,
-			id->ppaf.pg_offset, id->ppaf.pg_len,
-			id->ppaf.sect_offset, id->ppaf.sect_len);
+		return nvm_dev_attr_show_ppaf((void *)&geo->addrf, page);
 	} else if (strcmp(attr->name, "media_type") == 0) {	/* u8 */
 	} else if (strcmp(attr->name, "media_type") == 0) {	/* u8 */
-		return scnprintf(page, PAGE_SIZE, "%u\n", grp->mtype);
+		return scnprintf(page, PAGE_SIZE, "%u\n", geo->mtype);
 	} else if (strcmp(attr->name, "flash_media_type") == 0) {
 	} else if (strcmp(attr->name, "flash_media_type") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", grp->fmtype);
+		return scnprintf(page, PAGE_SIZE, "%u\n", geo->fmtype);
 	} else if (strcmp(attr->name, "num_channels") == 0) {
 	} else if (strcmp(attr->name, "num_channels") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_ch);
+		return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_ch);
 	} else if (strcmp(attr->name, "num_luns") == 0) {
 	} else if (strcmp(attr->name, "num_luns") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_lun);
+		return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_lun);
 	} else if (strcmp(attr->name, "num_planes") == 0) {
 	} else if (strcmp(attr->name, "num_planes") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pln);
+		return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_pln);
 	} else if (strcmp(attr->name, "num_blocks") == 0) {	/* u16 */
 	} else if (strcmp(attr->name, "num_blocks") == 0) {	/* u16 */
-		return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_chk);
+		return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_chk);
 	} else if (strcmp(attr->name, "num_pages") == 0) {
 	} else if (strcmp(attr->name, "num_pages") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pg);
+		return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_pg);
 	} else if (strcmp(attr->name, "page_size") == 0) {
 	} else if (strcmp(attr->name, "page_size") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", grp->fpg_sz);
+		return scnprintf(page, PAGE_SIZE, "%u\n", geo->fpg_sz);
 	} else if (strcmp(attr->name, "hw_sector_size") == 0) {
 	} else if (strcmp(attr->name, "hw_sector_size") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", grp->csecs);
+		return scnprintf(page, PAGE_SIZE, "%u\n", geo->csecs);
 	} else if (strcmp(attr->name, "oob_sector_size") == 0) {/* u32 */
 	} else if (strcmp(attr->name, "oob_sector_size") == 0) {/* u32 */
-		return scnprintf(page, PAGE_SIZE, "%u\n", grp->sos);
-	} else if (strcmp(attr->name, "read_typ") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", grp->trdt);
-	} else if (strcmp(attr->name, "read_max") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", grp->trdm);
+		return scnprintf(page, PAGE_SIZE, "%u\n", geo->sos);
 	} else if (strcmp(attr->name, "prog_typ") == 0) {
 	} else if (strcmp(attr->name, "prog_typ") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", grp->tprt);
+		return scnprintf(page, PAGE_SIZE, "%u\n", geo->tprt);
 	} else if (strcmp(attr->name, "prog_max") == 0) {
 	} else if (strcmp(attr->name, "prog_max") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", grp->tprm);
+		return scnprintf(page, PAGE_SIZE, "%u\n", geo->tprm);
 	} else if (strcmp(attr->name, "erase_typ") == 0) {
 	} else if (strcmp(attr->name, "erase_typ") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", grp->tbet);
+		return scnprintf(page, PAGE_SIZE, "%u\n", geo->tbet);
 	} else if (strcmp(attr->name, "erase_max") == 0) {
 	} else if (strcmp(attr->name, "erase_max") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n", grp->tbem);
+		return scnprintf(page, PAGE_SIZE, "%u\n", geo->tbem);
 	} else if (strcmp(attr->name, "multiplane_modes") == 0) {
 	} else if (strcmp(attr->name, "multiplane_modes") == 0) {
-		return scnprintf(page, PAGE_SIZE, "0x%08x\n", grp->mpos);
+		return scnprintf(page, PAGE_SIZE, "0x%08x\n", geo->mpos);
 	} else if (strcmp(attr->name, "media_capabilities") == 0) {
 	} else if (strcmp(attr->name, "media_capabilities") == 0) {
-		return scnprintf(page, PAGE_SIZE, "0x%08x\n", grp->mccap);
+		return scnprintf(page, PAGE_SIZE, "0x%08x\n", geo->mccap);
 	} else if (strcmp(attr->name, "max_phys_secs") == 0) {
 	} else if (strcmp(attr->name, "max_phys_secs") == 0) {
-		return scnprintf(page, PAGE_SIZE, "%u\n",
-				ndev->ops->max_phys_sect);
+		return scnprintf(page, PAGE_SIZE, "%u\n", NVM_MAX_VLBA);
 	} else {
 	} else {
-		return scnprintf(page,
-				 PAGE_SIZE,
-				 "Unhandled attr(%s) in `nvm_dev_attr_show`\n",
-				 attr->name);
+		return scnprintf(page, PAGE_SIZE,
+			"Unhandled attr(%s) in `%s`\n",
+			attr->name, __func__);
+	}
+}
+
+static ssize_t nvm_dev_attr_show_20(struct device *dev,
+		struct device_attribute *dattr, char *page)
+{
+	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
+	struct nvm_dev *ndev = ns->ndev;
+	struct nvm_geo *geo = &ndev->geo;
+	struct attribute *attr;
+
+	if (!ndev)
+		return 0;
+
+	attr = &dattr->attr;
+
+	if (strcmp(attr->name, "groups") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_ch);
+	} else if (strcmp(attr->name, "punits") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_lun);
+	} else if (strcmp(attr->name, "chunks") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_chk);
+	} else if (strcmp(attr->name, "clba") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", geo->clba);
+	} else if (strcmp(attr->name, "ws_min") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", geo->ws_min);
+	} else if (strcmp(attr->name, "ws_opt") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", geo->ws_opt);
+	} else if (strcmp(attr->name, "maxoc") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", geo->maxoc);
+	} else if (strcmp(attr->name, "maxocpu") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", geo->maxocpu);
+	} else if (strcmp(attr->name, "mw_cunits") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", geo->mw_cunits);
+	} else if (strcmp(attr->name, "write_typ") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", geo->tprt);
+	} else if (strcmp(attr->name, "write_max") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", geo->tprm);
+	} else if (strcmp(attr->name, "reset_typ") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", geo->tbet);
+	} else if (strcmp(attr->name, "reset_max") == 0) {
+		return scnprintf(page, PAGE_SIZE, "%u\n", geo->tbem);
+	} else {
+		return scnprintf(page, PAGE_SIZE,
+			"Unhandled attr(%s) in `%s`\n",
+			attr->name, __func__);
 	}
 	}
 }
 }
 
 
-#define NVM_DEV_ATTR_RO(_name)						\
+#define NVM_DEV_ATTR_RO(_name)					\
 	DEVICE_ATTR(_name, S_IRUGO, nvm_dev_attr_show, NULL)
 	DEVICE_ATTR(_name, S_IRUGO, nvm_dev_attr_show, NULL)
+#define NVM_DEV_ATTR_12_RO(_name)					\
+	DEVICE_ATTR(_name, S_IRUGO, nvm_dev_attr_show_12, NULL)
+#define NVM_DEV_ATTR_20_RO(_name)					\
+	DEVICE_ATTR(_name, S_IRUGO, nvm_dev_attr_show_20, NULL)
 
 
+/* general attributes */
 static NVM_DEV_ATTR_RO(version);
 static NVM_DEV_ATTR_RO(version);
-static NVM_DEV_ATTR_RO(vendor_opcode);
 static NVM_DEV_ATTR_RO(capabilities);
 static NVM_DEV_ATTR_RO(capabilities);
-static NVM_DEV_ATTR_RO(device_mode);
-static NVM_DEV_ATTR_RO(ppa_format);
-static NVM_DEV_ATTR_RO(media_manager);
-
-static NVM_DEV_ATTR_RO(media_type);
-static NVM_DEV_ATTR_RO(flash_media_type);
-static NVM_DEV_ATTR_RO(num_channels);
-static NVM_DEV_ATTR_RO(num_luns);
-static NVM_DEV_ATTR_RO(num_planes);
-static NVM_DEV_ATTR_RO(num_blocks);
-static NVM_DEV_ATTR_RO(num_pages);
-static NVM_DEV_ATTR_RO(page_size);
-static NVM_DEV_ATTR_RO(hw_sector_size);
-static NVM_DEV_ATTR_RO(oob_sector_size);
+
 static NVM_DEV_ATTR_RO(read_typ);
 static NVM_DEV_ATTR_RO(read_typ);
 static NVM_DEV_ATTR_RO(read_max);
 static NVM_DEV_ATTR_RO(read_max);
-static NVM_DEV_ATTR_RO(prog_typ);
-static NVM_DEV_ATTR_RO(prog_max);
-static NVM_DEV_ATTR_RO(erase_typ);
-static NVM_DEV_ATTR_RO(erase_max);
-static NVM_DEV_ATTR_RO(multiplane_modes);
-static NVM_DEV_ATTR_RO(media_capabilities);
-static NVM_DEV_ATTR_RO(max_phys_secs);
-
-static struct attribute *nvm_dev_attrs[] = {
+
+/* 1.2 values */
+static NVM_DEV_ATTR_12_RO(vendor_opcode);
+static NVM_DEV_ATTR_12_RO(device_mode);
+static NVM_DEV_ATTR_12_RO(ppa_format);
+static NVM_DEV_ATTR_12_RO(media_manager);
+static NVM_DEV_ATTR_12_RO(media_type);
+static NVM_DEV_ATTR_12_RO(flash_media_type);
+static NVM_DEV_ATTR_12_RO(num_channels);
+static NVM_DEV_ATTR_12_RO(num_luns);
+static NVM_DEV_ATTR_12_RO(num_planes);
+static NVM_DEV_ATTR_12_RO(num_blocks);
+static NVM_DEV_ATTR_12_RO(num_pages);
+static NVM_DEV_ATTR_12_RO(page_size);
+static NVM_DEV_ATTR_12_RO(hw_sector_size);
+static NVM_DEV_ATTR_12_RO(oob_sector_size);
+static NVM_DEV_ATTR_12_RO(prog_typ);
+static NVM_DEV_ATTR_12_RO(prog_max);
+static NVM_DEV_ATTR_12_RO(erase_typ);
+static NVM_DEV_ATTR_12_RO(erase_max);
+static NVM_DEV_ATTR_12_RO(multiplane_modes);
+static NVM_DEV_ATTR_12_RO(media_capabilities);
+static NVM_DEV_ATTR_12_RO(max_phys_secs);
+
+static struct attribute *nvm_dev_attrs_12[] = {
 	&dev_attr_version.attr,
 	&dev_attr_version.attr,
-	&dev_attr_vendor_opcode.attr,
 	&dev_attr_capabilities.attr,
 	&dev_attr_capabilities.attr,
+
+	&dev_attr_vendor_opcode.attr,
 	&dev_attr_device_mode.attr,
 	&dev_attr_device_mode.attr,
 	&dev_attr_media_manager.attr,
 	&dev_attr_media_manager.attr,
-
 	&dev_attr_ppa_format.attr,
 	&dev_attr_ppa_format.attr,
 	&dev_attr_media_type.attr,
 	&dev_attr_media_type.attr,
 	&dev_attr_flash_media_type.attr,
 	&dev_attr_flash_media_type.attr,
@@ -887,22 +1218,92 @@ static struct attribute *nvm_dev_attrs[] = {
 	&dev_attr_multiplane_modes.attr,
 	&dev_attr_multiplane_modes.attr,
 	&dev_attr_media_capabilities.attr,
 	&dev_attr_media_capabilities.attr,
 	&dev_attr_max_phys_secs.attr,
 	&dev_attr_max_phys_secs.attr,
+
 	NULL,
 	NULL,
 };
 };
 
 
-static const struct attribute_group nvm_dev_attr_group = {
+static const struct attribute_group nvm_dev_attr_group_12 = {
 	.name		= "lightnvm",
 	.name		= "lightnvm",
-	.attrs		= nvm_dev_attrs,
+	.attrs		= nvm_dev_attrs_12,
+};
+
+/* 2.0 values */
+static NVM_DEV_ATTR_20_RO(groups);
+static NVM_DEV_ATTR_20_RO(punits);
+static NVM_DEV_ATTR_20_RO(chunks);
+static NVM_DEV_ATTR_20_RO(clba);
+static NVM_DEV_ATTR_20_RO(ws_min);
+static NVM_DEV_ATTR_20_RO(ws_opt);
+static NVM_DEV_ATTR_20_RO(maxoc);
+static NVM_DEV_ATTR_20_RO(maxocpu);
+static NVM_DEV_ATTR_20_RO(mw_cunits);
+static NVM_DEV_ATTR_20_RO(write_typ);
+static NVM_DEV_ATTR_20_RO(write_max);
+static NVM_DEV_ATTR_20_RO(reset_typ);
+static NVM_DEV_ATTR_20_RO(reset_max);
+
+static struct attribute *nvm_dev_attrs_20[] = {
+	&dev_attr_version.attr,
+	&dev_attr_capabilities.attr,
+
+	&dev_attr_groups.attr,
+	&dev_attr_punits.attr,
+	&dev_attr_chunks.attr,
+	&dev_attr_clba.attr,
+	&dev_attr_ws_min.attr,
+	&dev_attr_ws_opt.attr,
+	&dev_attr_maxoc.attr,
+	&dev_attr_maxocpu.attr,
+	&dev_attr_mw_cunits.attr,
+
+	&dev_attr_read_typ.attr,
+	&dev_attr_read_max.attr,
+	&dev_attr_write_typ.attr,
+	&dev_attr_write_max.attr,
+	&dev_attr_reset_typ.attr,
+	&dev_attr_reset_max.attr,
+
+	NULL,
+};
+
+static const struct attribute_group nvm_dev_attr_group_20 = {
+	.name		= "lightnvm",
+	.attrs		= nvm_dev_attrs_20,
 };
 };
 
 
 int nvme_nvm_register_sysfs(struct nvme_ns *ns)
 int nvme_nvm_register_sysfs(struct nvme_ns *ns)
 {
 {
-	return sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
-					&nvm_dev_attr_group);
+	struct nvm_dev *ndev = ns->ndev;
+	struct nvm_geo *geo = &ndev->geo;
+
+	if (!ndev)
+		return -EINVAL;
+
+	switch (geo->major_ver_id) {
+	case 1:
+		return sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
+					&nvm_dev_attr_group_12);
+	case 2:
+		return sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
+					&nvm_dev_attr_group_20);
+	}
+
+	return -EINVAL;
 }
 }
 
 
 void nvme_nvm_unregister_sysfs(struct nvme_ns *ns)
 void nvme_nvm_unregister_sysfs(struct nvme_ns *ns)
 {
 {
-	sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
-					&nvm_dev_attr_group);
+	struct nvm_dev *ndev = ns->ndev;
+	struct nvm_geo *geo = &ndev->geo;
+
+	switch (geo->major_ver_id) {
+	case 1:
+		sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
+					&nvm_dev_attr_group_12);
+		break;
+	case 2:
+		sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
+					&nvm_dev_attr_group_20);
+		break;
+	}
 }
 }

+ 4 - 4
drivers/nvme/host/multipath.c

@@ -44,12 +44,12 @@ void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
 {
 {
 	struct nvme_ns *ns;
 	struct nvme_ns *ns;
 
 
-	mutex_lock(&ctrl->namespaces_mutex);
+	down_read(&ctrl->namespaces_rwsem);
 	list_for_each_entry(ns, &ctrl->namespaces, list) {
 	list_for_each_entry(ns, &ctrl->namespaces, list) {
 		if (ns->head->disk)
 		if (ns->head->disk)
 			kblockd_schedule_work(&ns->head->requeue_work);
 			kblockd_schedule_work(&ns->head->requeue_work);
 	}
 	}
-	mutex_unlock(&ctrl->namespaces_mutex);
+	up_read(&ctrl->namespaces_rwsem);
 }
 }
 
 
 static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head)
 static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head)
@@ -162,13 +162,13 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
 	if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath)
 	if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath)
 		return 0;
 		return 0;
 
 
-	q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE);
+	q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL);
 	if (!q)
 	if (!q)
 		goto out;
 		goto out;
 	q->queuedata = head;
 	q->queuedata = head;
 	blk_queue_make_request(q, nvme_ns_head_make_request);
 	blk_queue_make_request(q, nvme_ns_head_make_request);
 	q->poll_fn = nvme_ns_head_poll;
 	q->poll_fn = nvme_ns_head_poll;
-	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
+	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
 	/* set to a default value for 512 until disk is validated */
 	/* set to a default value for 512 until disk is validated */
 	blk_queue_logical_block_size(q, 512);
 	blk_queue_logical_block_size(q, 512);
 
 

+ 34 - 1
drivers/nvme/host/nvme.h

@@ -21,6 +21,7 @@
 #include <linux/blk-mq.h>
 #include <linux/blk-mq.h>
 #include <linux/lightnvm.h>
 #include <linux/lightnvm.h>
 #include <linux/sed-opal.h>
 #include <linux/sed-opal.h>
+#include <linux/fault-inject.h>
 
 
 extern unsigned int nvme_io_timeout;
 extern unsigned int nvme_io_timeout;
 #define NVME_IO_TIMEOUT	(nvme_io_timeout * HZ)
 #define NVME_IO_TIMEOUT	(nvme_io_timeout * HZ)
@@ -140,7 +141,7 @@ struct nvme_ctrl {
 	struct blk_mq_tag_set *tagset;
 	struct blk_mq_tag_set *tagset;
 	struct blk_mq_tag_set *admin_tagset;
 	struct blk_mq_tag_set *admin_tagset;
 	struct list_head namespaces;
 	struct list_head namespaces;
-	struct mutex namespaces_mutex;
+	struct rw_semaphore namespaces_rwsem;
 	struct device ctrl_device;
 	struct device ctrl_device;
 	struct device *device;	/* char device */
 	struct device *device;	/* char device */
 	struct cdev cdev;
 	struct cdev cdev;
@@ -261,6 +262,15 @@ struct nvme_ns_head {
 	int			instance;
 	int			instance;
 };
 };
 
 
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+struct nvme_fault_inject {
+	struct fault_attr attr;
+	struct dentry *parent;
+	bool dont_retry;	/* DNR, do not retry */
+	u16 status;		/* status code */
+};
+#endif
+
 struct nvme_ns {
 struct nvme_ns {
 	struct list_head list;
 	struct list_head list;
 
 
@@ -282,6 +292,11 @@ struct nvme_ns {
 #define NVME_NS_REMOVING 0
 #define NVME_NS_REMOVING 0
 #define NVME_NS_DEAD     1
 #define NVME_NS_DEAD     1
 	u16 noiob;
 	u16 noiob;
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+	struct nvme_fault_inject fault_inject;
+#endif
+
 };
 };
 
 
 struct nvme_ctrl_ops {
 struct nvme_ctrl_ops {
@@ -298,8 +313,19 @@ struct nvme_ctrl_ops {
 	void (*delete_ctrl)(struct nvme_ctrl *ctrl);
 	void (*delete_ctrl)(struct nvme_ctrl *ctrl);
 	int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
 	int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
 	int (*reinit_request)(void *data, struct request *rq);
 	int (*reinit_request)(void *data, struct request *rq);
+	void (*stop_ctrl)(struct nvme_ctrl *ctrl);
 };
 };
 
 
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+void nvme_fault_inject_init(struct nvme_ns *ns);
+void nvme_fault_inject_fini(struct nvme_ns *ns);
+void nvme_should_fail(struct request *req);
+#else
+static inline void nvme_fault_inject_init(struct nvme_ns *ns) {}
+static inline void nvme_fault_inject_fini(struct nvme_ns *ns) {}
+static inline void nvme_should_fail(struct request *req) {}
+#endif
+
 static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl)
 static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl)
 {
 {
 	u32 val = 0;
 	u32 val = 0;
@@ -336,6 +362,8 @@ static inline void nvme_end_request(struct request *req, __le16 status,
 
 
 	rq->status = le16_to_cpu(status) >> 1;
 	rq->status = le16_to_cpu(status) >> 1;
 	rq->result = result;
 	rq->result = result;
+	/* inject error when permitted by fault injection framework */
+	nvme_should_fail(req);
 	blk_mq_complete_request(req);
 	blk_mq_complete_request(req);
 }
 }
 
 
@@ -401,6 +429,9 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
 int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
 int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
 int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl);
 int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl);
 
 
+int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
+		u8 log_page, void *log, size_t size, size_t offset);
+
 extern const struct attribute_group nvme_ns_id_attr_group;
 extern const struct attribute_group nvme_ns_id_attr_group;
 extern const struct block_device_operations nvme_ns_head_ops;
 extern const struct block_device_operations nvme_ns_head_ops;
 
 
@@ -461,12 +492,14 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
 #endif /* CONFIG_NVME_MULTIPATH */
 #endif /* CONFIG_NVME_MULTIPATH */
 
 
 #ifdef CONFIG_NVM
 #ifdef CONFIG_NVM
+void nvme_nvm_update_nvm_info(struct nvme_ns *ns);
 int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node);
 int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node);
 void nvme_nvm_unregister(struct nvme_ns *ns);
 void nvme_nvm_unregister(struct nvme_ns *ns);
 int nvme_nvm_register_sysfs(struct nvme_ns *ns);
 int nvme_nvm_register_sysfs(struct nvme_ns *ns);
 void nvme_nvm_unregister_sysfs(struct nvme_ns *ns);
 void nvme_nvm_unregister_sysfs(struct nvme_ns *ns);
 int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg);
 int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg);
 #else
 #else
+static inline void nvme_nvm_update_nvm_info(struct nvme_ns *ns) {};
 static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name,
 static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name,
 				    int node)
 				    int node)
 {
 {

+ 18 - 8
drivers/nvme/host/pci.c

@@ -414,7 +414,7 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
 {
 {
 	struct nvme_dev *dev = set->driver_data;
 	struct nvme_dev *dev = set->driver_data;
 
 
-	return blk_mq_pci_map_queues(set, to_pci_dev(dev->dev));
+	return blk_mq_pci_map_queues(set, to_pci_dev(dev->dev), 0);
 }
 }
 
 
 /**
 /**
@@ -2197,7 +2197,11 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 	if (!dead) {
 	if (!dead) {
 		if (shutdown)
 		if (shutdown)
 			nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
 			nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
+	}
+
+	nvme_stop_queues(&dev->ctrl);
 
 
+	if (!dead) {
 		/*
 		/*
 		 * If the controller is still alive tell it to stop using the
 		 * If the controller is still alive tell it to stop using the
 		 * host memory buffer.  In theory the shutdown / reset should
 		 * host memory buffer.  In theory the shutdown / reset should
@@ -2206,11 +2210,6 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 		 */
 		 */
 		if (dev->host_mem_descs)
 		if (dev->host_mem_descs)
 			nvme_set_host_mem(dev, 0);
 			nvme_set_host_mem(dev, 0);
-
-	}
-	nvme_stop_queues(&dev->ctrl);
-
-	if (!dead) {
 		nvme_disable_io_queues(dev);
 		nvme_disable_io_queues(dev);
 		nvme_disable_admin_queue(dev, shutdown);
 		nvme_disable_admin_queue(dev, shutdown);
 	}
 	}
@@ -2416,6 +2415,13 @@ static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
 	return 0;
 	return 0;
 }
 }
 
 
+static int nvme_pci_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
+{
+	struct pci_dev *pdev = to_pci_dev(to_nvme_dev(ctrl)->dev);
+
+	return snprintf(buf, size, "%s", dev_name(&pdev->dev));
+}
+
 static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
 static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
 	.name			= "pcie",
 	.name			= "pcie",
 	.module			= THIS_MODULE,
 	.module			= THIS_MODULE,
@@ -2425,6 +2431,7 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
 	.reg_read64		= nvme_pci_reg_read64,
 	.reg_read64		= nvme_pci_reg_read64,
 	.free_ctrl		= nvme_pci_free_ctrl,
 	.free_ctrl		= nvme_pci_free_ctrl,
 	.submit_async_event	= nvme_pci_submit_async_event,
 	.submit_async_event	= nvme_pci_submit_async_event,
+	.get_address		= nvme_pci_get_address,
 };
 };
 
 
 static int nvme_dev_map(struct nvme_dev *dev)
 static int nvme_dev_map(struct nvme_dev *dev)
@@ -2461,10 +2468,13 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
 	} else if (pdev->vendor == 0x144d && pdev->device == 0xa804) {
 	} else if (pdev->vendor == 0x144d && pdev->device == 0xa804) {
 		/*
 		/*
 		 * Samsung SSD 960 EVO drops off the PCIe bus after system
 		 * Samsung SSD 960 EVO drops off the PCIe bus after system
-		 * suspend on a Ryzen board, ASUS PRIME B350M-A.
+		 * suspend on a Ryzen board, ASUS PRIME B350M-A, as well as
+		 * within few minutes after bootup on a Coffee Lake board -
+		 * ASUS PRIME Z370-A
 		 */
 		 */
 		if (dmi_match(DMI_BOARD_VENDOR, "ASUSTeK COMPUTER INC.") &&
 		if (dmi_match(DMI_BOARD_VENDOR, "ASUSTeK COMPUTER INC.") &&
-		    dmi_match(DMI_BOARD_NAME, "PRIME B350M-A"))
+		    (dmi_match(DMI_BOARD_NAME, "PRIME B350M-A") ||
+		     dmi_match(DMI_BOARD_NAME, "PRIME Z370-A")))
 			return NVME_QUIRK_NO_APST;
 			return NVME_QUIRK_NO_APST;
 	}
 	}
 
 

+ 25 - 9
drivers/nvme/host/rdma.c

@@ -867,6 +867,14 @@ out_free_io_queues:
 	return ret;
 	return ret;
 }
 }
 
 
+static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl)
+{
+	struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
+
+	cancel_work_sync(&ctrl->err_work);
+	cancel_delayed_work_sync(&ctrl->reconnect_work);
+}
+
 static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
 static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
 {
 {
 	struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
 	struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
@@ -899,7 +907,6 @@ static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl)
 		queue_delayed_work(nvme_wq, &ctrl->reconnect_work,
 		queue_delayed_work(nvme_wq, &ctrl->reconnect_work,
 				ctrl->ctrl.opts->reconnect_delay * HZ);
 				ctrl->ctrl.opts->reconnect_delay * HZ);
 	} else {
 	} else {
-		dev_info(ctrl->ctrl.device, "Removing controller...\n");
 		nvme_delete_ctrl(&ctrl->ctrl);
 		nvme_delete_ctrl(&ctrl->ctrl);
 	}
 	}
 }
 }
@@ -974,8 +981,8 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
 	nvme_start_queues(&ctrl->ctrl);
 	nvme_start_queues(&ctrl->ctrl);
 
 
 	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
 	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
-		/* state change failure should never happen */
-		WARN_ON_ONCE(1);
+		/* state change failure is ok if we're in DELETING state */
+		WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
 		return;
 		return;
 	}
 	}
 
 
@@ -1719,9 +1726,6 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
 
 
 static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
 static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
 {
 {
-	cancel_work_sync(&ctrl->err_work);
-	cancel_delayed_work_sync(&ctrl->reconnect_work);
-
 	if (ctrl->ctrl.queue_count > 1) {
 	if (ctrl->ctrl.queue_count > 1) {
 		nvme_stop_queues(&ctrl->ctrl);
 		nvme_stop_queues(&ctrl->ctrl);
 		blk_mq_tagset_busy_iter(&ctrl->tag_set,
 		blk_mq_tagset_busy_iter(&ctrl->tag_set,
@@ -1799,6 +1803,7 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
 	.submit_async_event	= nvme_rdma_submit_async_event,
 	.submit_async_event	= nvme_rdma_submit_async_event,
 	.delete_ctrl		= nvme_rdma_delete_ctrl,
 	.delete_ctrl		= nvme_rdma_delete_ctrl,
 	.get_address		= nvmf_get_address,
 	.get_address		= nvmf_get_address,
+	.stop_ctrl		= nvme_rdma_stop_ctrl,
 };
 };
 
 
 static inline bool
 static inline bool
@@ -2025,15 +2030,26 @@ static struct nvmf_transport_ops nvme_rdma_transport = {
 static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data)
 static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data)
 {
 {
 	struct nvme_rdma_ctrl *ctrl;
 	struct nvme_rdma_ctrl *ctrl;
+	struct nvme_rdma_device *ndev;
+	bool found = false;
+
+	mutex_lock(&device_list_mutex);
+	list_for_each_entry(ndev, &device_list, entry) {
+		if (ndev->dev == ib_device) {
+			found = true;
+			break;
+		}
+	}
+	mutex_unlock(&device_list_mutex);
+
+	if (!found)
+		return;
 
 
 	/* Delete all controllers using this device */
 	/* Delete all controllers using this device */
 	mutex_lock(&nvme_rdma_ctrl_mutex);
 	mutex_lock(&nvme_rdma_ctrl_mutex);
 	list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) {
 	list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) {
 		if (ctrl->device->dev != ib_device)
 		if (ctrl->device->dev != ib_device)
 			continue;
 			continue;
-		dev_info(ctrl->ctrl.device,
-			"Removing ctrl: NQN \"%s\", addr %pISp\n",
-			ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
 		nvme_delete_ctrl(&ctrl->ctrl);
 		nvme_delete_ctrl(&ctrl->ctrl);
 	}
 	}
 	mutex_unlock(&nvme_rdma_ctrl_mutex);
 	mutex_unlock(&nvme_rdma_ctrl_mutex);

+ 31 - 34
drivers/nvme/target/configfs.c

@@ -23,6 +23,15 @@
 static const struct config_item_type nvmet_host_type;
 static const struct config_item_type nvmet_host_type;
 static const struct config_item_type nvmet_subsys_type;
 static const struct config_item_type nvmet_subsys_type;
 
 
+static const struct nvmet_transport_name {
+	u8		type;
+	const char	*name;
+} nvmet_transport_names[] = {
+	{ NVMF_TRTYPE_RDMA,	"rdma" },
+	{ NVMF_TRTYPE_FC,	"fc" },
+	{ NVMF_TRTYPE_LOOP,	"loop" },
+};
+
 /*
 /*
  * nvmet_port Generic ConfigFS definitions.
  * nvmet_port Generic ConfigFS definitions.
  * Used in any place in the ConfigFS tree that refers to an address.
  * Used in any place in the ConfigFS tree that refers to an address.
@@ -208,43 +217,30 @@ CONFIGFS_ATTR(nvmet_, addr_trsvcid);
 static ssize_t nvmet_addr_trtype_show(struct config_item *item,
 static ssize_t nvmet_addr_trtype_show(struct config_item *item,
 		char *page)
 		char *page)
 {
 {
-	switch (to_nvmet_port(item)->disc_addr.trtype) {
-	case NVMF_TRTYPE_RDMA:
-		return sprintf(page, "rdma\n");
-	case NVMF_TRTYPE_LOOP:
-		return sprintf(page, "loop\n");
-	case NVMF_TRTYPE_FC:
-		return sprintf(page, "fc\n");
-	default:
-		return sprintf(page, "\n");
+	struct nvmet_port *port = to_nvmet_port(item);
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(nvmet_transport_names); i++) {
+		if (port->disc_addr.trtype != nvmet_transport_names[i].type)
+			continue;
+		return sprintf(page, "%s\n", nvmet_transport_names[i].name);
 	}
 	}
+
+	return sprintf(page, "\n");
 }
 }
 
 
 static void nvmet_port_init_tsas_rdma(struct nvmet_port *port)
 static void nvmet_port_init_tsas_rdma(struct nvmet_port *port)
 {
 {
-	port->disc_addr.trtype = NVMF_TRTYPE_RDMA;
-	memset(&port->disc_addr.tsas.rdma, 0, NVMF_TSAS_SIZE);
 	port->disc_addr.tsas.rdma.qptype = NVMF_RDMA_QPTYPE_CONNECTED;
 	port->disc_addr.tsas.rdma.qptype = NVMF_RDMA_QPTYPE_CONNECTED;
 	port->disc_addr.tsas.rdma.prtype = NVMF_RDMA_PRTYPE_NOT_SPECIFIED;
 	port->disc_addr.tsas.rdma.prtype = NVMF_RDMA_PRTYPE_NOT_SPECIFIED;
 	port->disc_addr.tsas.rdma.cms = NVMF_RDMA_CMS_RDMA_CM;
 	port->disc_addr.tsas.rdma.cms = NVMF_RDMA_CMS_RDMA_CM;
 }
 }
 
 
-static void nvmet_port_init_tsas_loop(struct nvmet_port *port)
-{
-	port->disc_addr.trtype = NVMF_TRTYPE_LOOP;
-	memset(&port->disc_addr.tsas, 0, NVMF_TSAS_SIZE);
-}
-
-static void nvmet_port_init_tsas_fc(struct nvmet_port *port)
-{
-	port->disc_addr.trtype = NVMF_TRTYPE_FC;
-	memset(&port->disc_addr.tsas, 0, NVMF_TSAS_SIZE);
-}
-
 static ssize_t nvmet_addr_trtype_store(struct config_item *item,
 static ssize_t nvmet_addr_trtype_store(struct config_item *item,
 		const char *page, size_t count)
 		const char *page, size_t count)
 {
 {
 	struct nvmet_port *port = to_nvmet_port(item);
 	struct nvmet_port *port = to_nvmet_port(item);
+	int i;
 
 
 	if (port->enabled) {
 	if (port->enabled) {
 		pr_err("Cannot modify address while enabled\n");
 		pr_err("Cannot modify address while enabled\n");
@@ -252,17 +248,18 @@ static ssize_t nvmet_addr_trtype_store(struct config_item *item,
 		return -EACCES;
 		return -EACCES;
 	}
 	}
 
 
-	if (sysfs_streq(page, "rdma")) {
-		nvmet_port_init_tsas_rdma(port);
-	} else if (sysfs_streq(page, "loop")) {
-		nvmet_port_init_tsas_loop(port);
-	} else if (sysfs_streq(page, "fc")) {
-		nvmet_port_init_tsas_fc(port);
-	} else {
-		pr_err("Invalid value '%s' for trtype\n", page);
-		return -EINVAL;
+	for (i = 0; i < ARRAY_SIZE(nvmet_transport_names); i++) {
+		if (sysfs_streq(page, nvmet_transport_names[i].name))
+			goto found;
 	}
 	}
 
 
+	pr_err("Invalid value '%s' for trtype\n", page);
+	return -EINVAL;
+found:
+	memset(&port->disc_addr.tsas, 0, NVMF_TSAS_SIZE);
+	port->disc_addr.trtype = nvmet_transport_names[i].type;
+	if (port->disc_addr.trtype == NVMF_TRTYPE_RDMA)
+		nvmet_port_init_tsas_rdma(port);
 	return count;
 	return count;
 }
 }
 
 
@@ -333,13 +330,13 @@ out_unlock:
 	return ret ? ret : count;
 	return ret ? ret : count;
 }
 }
 
 
+CONFIGFS_ATTR(nvmet_ns_, device_uuid);
+
 static ssize_t nvmet_ns_device_nguid_show(struct config_item *item, char *page)
 static ssize_t nvmet_ns_device_nguid_show(struct config_item *item, char *page)
 {
 {
 	return sprintf(page, "%pUb\n", &to_nvmet_ns(item)->nguid);
 	return sprintf(page, "%pUb\n", &to_nvmet_ns(item)->nguid);
 }
 }
 
 
-CONFIGFS_ATTR(nvmet_ns_, device_uuid);
-
 static ssize_t nvmet_ns_device_nguid_store(struct config_item *item,
 static ssize_t nvmet_ns_device_nguid_store(struct config_item *item,
 		const char *page, size_t count)
 		const char *page, size_t count)
 {
 {

Some files were not shown because too many files changed in this diff