Browse Source

blk-mq: improve layout of blk_mq_hw_ctx

Various cache line optimizations:

- Move delay_work towards the end. It's huge, and we don't use it
  a lot (only SCSI).

- Move the atomic state into the same cacheline as the dispatch
  list and lock.

- Rearrange a few members to pack it better.

- Shrink the max-order for dispatch accounting from 10 to 7. This
  means that ->dispatched[] and ->run now take up their own
  cacheline.

This shrinks struct blk_mq_hw_ctx down to 8 cachelines.

Signed-off-by: Jens Axboe <axboe@fb.com>
Jens Axboe 9 years ago
parent
commit
8d354f133e
1 changed file with 5 additions and 4 deletions
  1. 5 4
      include/linux/blk-mq.h

+ 5 - 4
include/linux/blk-mq.h

@@ -22,11 +22,10 @@ struct blk_mq_hw_ctx {
 	struct {
 		spinlock_t		lock;
 		struct list_head	dispatch;
+		unsigned long		state;		/* BLK_MQ_S_* flags */
 	} ____cacheline_aligned_in_smp;
 
-	unsigned long		state;		/* BLK_MQ_S_* flags */
 	struct work_struct	run_work;
-	struct delayed_work	delay_work;
 	cpumask_var_t		cpumask;
 	int			next_cpu;
 	int			next_cpu_batch;
@@ -40,8 +39,8 @@ struct blk_mq_hw_ctx {
 
 	struct blk_mq_ctxmap	ctx_map;
 
-	unsigned int		nr_ctx;
 	struct blk_mq_ctx	**ctxs;
+	unsigned int		nr_ctx;
 
 	atomic_t		wait_index;
 
@@ -49,7 +48,7 @@ struct blk_mq_hw_ctx {
 
 	unsigned long		queued;
 	unsigned long		run;
-#define BLK_MQ_MAX_DISPATCH_ORDER	10
+#define BLK_MQ_MAX_DISPATCH_ORDER	7
 	unsigned long		dispatched[BLK_MQ_MAX_DISPATCH_ORDER];
 
 	unsigned int		numa_node;
@@ -57,6 +56,8 @@ struct blk_mq_hw_ctx {
 
 	atomic_t		nr_active;
 
+	struct delayed_work	delay_work;
+
 	struct blk_mq_cpu_notifier	cpu_notifier;
 	struct kobject		kobj;