null_blk.c
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/blk-mq.h>
#include <linux/hrtimer.h>
struct nullb_cmd {
	struct list_head list;
	struct llist_node ll_list;
	struct call_single_data csd;
	struct request *rq;
	struct bio *bio;
	unsigned int tag;
	struct nullb_queue *nq;
};
struct nullb_queue {
	unsigned long *tag_map;
	wait_queue_head_t wait;
	unsigned int queue_depth;

	struct nullb_cmd *cmds;
};
struct nullb {
	struct list_head list;
	unsigned int index;
	struct request_queue *q;
	struct gendisk *disk;
	struct blk_mq_tag_set tag_set;
	struct hrtimer timer;
	unsigned int queue_depth;
	spinlock_t lock;

	struct nullb_queue *queues;
	unsigned int nr_queues;
};
static LIST_HEAD(nullb_list);
static struct mutex lock;
static int null_major;
static int nullb_indexes;

struct completion_queue {
	struct llist_head list;
	struct hrtimer timer;
};
/*
 * These are per-cpu for now, they will need to be configured by the
 * complete_queues parameter and appropriately mapped.
 */
static DEFINE_PER_CPU(struct completion_queue, completion_queues);
enum {
	NULL_IRQ_NONE		= 0,
	NULL_IRQ_SOFTIRQ	= 1,
	NULL_IRQ_TIMER		= 2,
};

enum {
	NULL_Q_BIO		= 0,
	NULL_Q_RQ		= 1,
	NULL_Q_MQ		= 2,
};
static int submit_queues;
module_param(submit_queues, int, S_IRUGO);
MODULE_PARM_DESC(submit_queues, "Number of submission queues");

static int home_node = NUMA_NO_NODE;
module_param(home_node, int, S_IRUGO);
MODULE_PARM_DESC(home_node, "Home node for the device");

static int queue_mode = NULL_Q_MQ;
module_param(queue_mode, int, S_IRUGO);
MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)");

static int gb = 250;
module_param(gb, int, S_IRUGO);
MODULE_PARM_DESC(gb, "Size in GB");

static int bs = 512;
module_param(bs, int, S_IRUGO);
MODULE_PARM_DESC(bs, "Block size (in bytes)");

static int nr_devices = 2;
module_param(nr_devices, int, S_IRUGO);
MODULE_PARM_DESC(nr_devices, "Number of devices to register");

static int irqmode = NULL_IRQ_SOFTIRQ;
module_param(irqmode, int, S_IRUGO);
MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer");

static int completion_nsec = 10000;
module_param(completion_nsec, int, S_IRUGO);
MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns");

static int hw_queue_depth = 64;
module_param(hw_queue_depth, int, S_IRUGO);
MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64");

static bool use_per_node_hctx = false;
module_param(use_per_node_hctx, bool, S_IRUGO);
MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false");
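
/*
 * Per-queue tag management for the bio and rq modes: each queue keeps a
 * simple bitmap, get_tag() claims the first free bit and put_tag() releases
 * it and wakes any submitter waiting for a free command slot.
 */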
static void put_tag(struct nullb_queue *nq, unsigned int tag)
{
	clear_bit_unlock(tag, nq->tag_map);

	if (waitqueue_active(&nq->wait))
		wake_up(&nq->wait);
}

static unsigned int get_tag(struct nullb_queue *nq)
{
	unsigned int tag;

	do {
		tag = find_first_zero_bit(nq->tag_map, nq->queue_depth);
		if (tag >= nq->queue_depth)
			return -1U;
	} while (test_and_set_bit_lock(tag, nq->tag_map));

	return tag;
}
static void free_cmd(struct nullb_cmd *cmd)
{
	put_tag(cmd->nq, cmd->tag);
}

static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq)
{
	struct nullb_cmd *cmd;
	unsigned int tag;

	tag = get_tag(nq);
	if (tag != -1U) {
		cmd = &nq->cmds[tag];
		cmd->tag = tag;
		cmd->nq = nq;
		return cmd;
	}

	return NULL;
}
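
/*
 * Allocate a command. If can_wait is set, the caller may sleep until a tag
 * becomes available; otherwise NULL is returned when the queue is full.
 */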
static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, int can_wait)
{
	struct nullb_cmd *cmd;
	DEFINE_WAIT(wait);

	cmd = __alloc_cmd(nq);
	if (cmd || !can_wait)
		return cmd;

	do {
		prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE);
		cmd = __alloc_cmd(nq);
		if (cmd)
			break;

		io_schedule();
	} while (1);

	finish_wait(&nq->wait, &wait);
	return cmd;
}
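
/* Complete a command according to the configured queue_mode, then drop its tag. */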
static void end_cmd(struct nullb_cmd *cmd)
{
	switch (queue_mode) {
	case NULL_Q_MQ:
		blk_mq_end_request(cmd->rq, 0);
		return;
	case NULL_Q_RQ:
		INIT_LIST_HEAD(&cmd->rq->queuelist);
		blk_end_request_all(cmd->rq, 0);
		break;
	case NULL_Q_BIO:
		bio_endio(cmd->bio, 0);
		break;
	}

	free_cmd(cmd);
}
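
/*
 * Timer-based completion (irqmode=2): commands are parked on a per-cpu llist
 * and, when the hrtimer fires, completed in submission order.
 */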
static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer)
{
	struct completion_queue *cq;
	struct llist_node *entry;
	struct nullb_cmd *cmd;

	cq = &per_cpu(completion_queues, smp_processor_id());

	while ((entry = llist_del_all(&cq->list)) != NULL) {
		entry = llist_reverse_order(entry);
		do {
			cmd = container_of(entry, struct nullb_cmd, ll_list);
			entry = entry->next;
			end_cmd(cmd);
		} while (entry);
	}

	return HRTIMER_NORESTART;
}
static void null_cmd_end_timer(struct nullb_cmd *cmd)
{
	struct completion_queue *cq = &per_cpu(completion_queues, get_cpu());

	cmd->ll_list.next = NULL;
	if (llist_add(&cmd->ll_list, &cq->list)) {
		ktime_t kt = ktime_set(0, completion_nsec);

		hrtimer_start(&cq->timer, kt, HRTIMER_MODE_REL);
	}

	put_cpu();
}
static void null_softirq_done_fn(struct request *rq)
{
	if (queue_mode == NULL_Q_MQ)
		end_cmd(blk_mq_rq_to_pdu(rq));
	else
		end_cmd(rq->special);
}
static inline void null_handle_cmd(struct nullb_cmd *cmd)
{
	/* Complete IO by inline, softirq or timer */
	switch (irqmode) {
	case NULL_IRQ_SOFTIRQ:
		switch (queue_mode) {
		case NULL_Q_MQ:
			blk_mq_complete_request(cmd->rq);
			break;
		case NULL_Q_RQ:
			blk_complete_request(cmd->rq);
			break;
		case NULL_Q_BIO:
			/*
			 * XXX: no proper submitting cpu information available.
			 */
			end_cmd(cmd);
			break;
		}
		break;
	case NULL_IRQ_NONE:
		end_cmd(cmd);
		break;
	case NULL_IRQ_TIMER:
		null_cmd_end_timer(cmd);
		break;
	}
}
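
/* Map the submitting CPU onto one of the device's queues (bio and rq modes). */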
static struct nullb_queue *nullb_to_queue(struct nullb *nullb)
{
	int index = 0;

	if (nullb->nr_queues != 1)
		index = raw_smp_processor_id() / ((nr_cpu_ids + nullb->nr_queues - 1) / nullb->nr_queues);

	return &nullb->queues[index];
}
static void null_queue_bio(struct request_queue *q, struct bio *bio)
{
	struct nullb *nullb = q->queuedata;
	struct nullb_queue *nq = nullb_to_queue(nullb);
	struct nullb_cmd *cmd;

	cmd = alloc_cmd(nq, 1);
	cmd->bio = bio;

	null_handle_cmd(cmd);
}
static int null_rq_prep_fn(struct request_queue *q, struct request *req)
{
	struct nullb *nullb = q->queuedata;
	struct nullb_queue *nq = nullb_to_queue(nullb);
	struct nullb_cmd *cmd;

	cmd = alloc_cmd(nq, 0);
	if (cmd) {
		cmd->rq = req;
		req->special = cmd;
		return BLKPREP_OK;
	}

	return BLKPREP_DEFER;
}
static void null_request_fn(struct request_queue *q)
{
	struct request *rq;

	while ((rq = blk_fetch_request(q)) != NULL) {
		struct nullb_cmd *cmd = rq->special;

		spin_unlock_irq(q->queue_lock);
		null_handle_cmd(cmd);
		spin_lock_irq(q->queue_lock);
	}
}
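
/*
 * blk-mq submission path: the command lives in the request's per-driver pdu,
 * so no separate tag allocation is needed here.
 */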
static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq,
			 bool last)
{
	struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);

	cmd->rq = rq;
	cmd->nq = hctx->driver_data;

	blk_mq_start_request(rq);

	null_handle_cmd(cmd);
	return BLK_MQ_RQ_QUEUE_OK;
}
static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq)
{
	BUG_ON(!nullb);
	BUG_ON(!nq);

	init_waitqueue_head(&nq->wait);
	nq->queue_depth = nullb->queue_depth;
}

static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
			  unsigned int index)
{
	struct nullb *nullb = data;
	struct nullb_queue *nq = &nullb->queues[index];

	hctx->driver_data = nq;
	null_init_queue(nullb, nq);
	nullb->nr_queues++;

	return 0;
}
static struct blk_mq_ops null_mq_ops = {
	.queue_rq	= null_queue_rq,
	.map_queue	= blk_mq_map_queue,
	.init_hctx	= null_init_hctx,
	.complete	= null_softirq_done_fn,
};
static void null_del_dev(struct nullb *nullb)
{
	list_del_init(&nullb->list);

	del_gendisk(nullb->disk);
	blk_cleanup_queue(nullb->q);
	if (queue_mode == NULL_Q_MQ)
		blk_mq_free_tag_set(&nullb->tag_set);
	put_disk(nullb->disk);
	kfree(nullb);
}
static int null_open(struct block_device *bdev, fmode_t mode)
{
	return 0;
}

static void null_release(struct gendisk *disk, fmode_t mode)
{
}

static const struct block_device_operations null_fops = {
	.owner		= THIS_MODULE,
	.open		= null_open,
	.release	= null_release,
};
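
/*
 * Allocate the per-queue command array and the tag bitmap backing
 * get_tag()/put_tag(); only the bio and rq modes use these.
 */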
static int setup_commands(struct nullb_queue *nq)
{
	struct nullb_cmd *cmd;
	int i, tag_size;

	nq->cmds = kzalloc(nq->queue_depth * sizeof(*cmd), GFP_KERNEL);
	if (!nq->cmds)
		return -ENOMEM;

	tag_size = ALIGN(nq->queue_depth, BITS_PER_LONG) / BITS_PER_LONG;
	nq->tag_map = kzalloc(tag_size * sizeof(unsigned long), GFP_KERNEL);
	if (!nq->tag_map) {
		kfree(nq->cmds);
		return -ENOMEM;
	}

	for (i = 0; i < nq->queue_depth; i++) {
		cmd = &nq->cmds[i];
		INIT_LIST_HEAD(&cmd->list);
		cmd->ll_list.next = NULL;
		cmd->tag = -1U;
	}

	return 0;
}
static void cleanup_queue(struct nullb_queue *nq)
{
	kfree(nq->tag_map);
	kfree(nq->cmds);
}

static void cleanup_queues(struct nullb *nullb)
{
	int i;

	for (i = 0; i < nullb->nr_queues; i++)
		cleanup_queue(&nullb->queues[i]);

	kfree(nullb->queues);
}
static int setup_queues(struct nullb *nullb)
{
	nullb->queues = kzalloc(submit_queues * sizeof(struct nullb_queue),
				GFP_KERNEL);
	if (!nullb->queues)
		return -ENOMEM;

	nullb->nr_queues = 0;
	nullb->queue_depth = hw_queue_depth;

	return 0;
}
static int init_driver_queues(struct nullb *nullb)
{
	struct nullb_queue *nq;
	int i, ret = 0;

	for (i = 0; i < submit_queues; i++) {
		nq = &nullb->queues[i];

		null_init_queue(nullb, nq);

		ret = setup_commands(nq);
		if (ret)
			goto err_queue;
		nullb->nr_queues++;
	}

	return 0;
err_queue:
	cleanup_queues(nullb);
	return ret;
}
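
/*
 * Create one null block device: set up the request queue for the selected
 * queue_mode (bio, rq or blk-mq), then allocate and register the gendisk.
 */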
static int null_add_dev(void)
{
	struct gendisk *disk;
	struct nullb *nullb;
	sector_t size;
	int rv;

	nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, home_node);
	if (!nullb) {
		rv = -ENOMEM;
		goto out;
	}

	spin_lock_init(&nullb->lock);

	if (queue_mode == NULL_Q_MQ && use_per_node_hctx)
		submit_queues = nr_online_nodes;

	rv = setup_queues(nullb);
	if (rv)
		goto out_free_nullb;

	if (queue_mode == NULL_Q_MQ) {
		nullb->tag_set.ops = &null_mq_ops;
		nullb->tag_set.nr_hw_queues = submit_queues;
		nullb->tag_set.queue_depth = hw_queue_depth;
		nullb->tag_set.numa_node = home_node;
		nullb->tag_set.cmd_size = sizeof(struct nullb_cmd);
		nullb->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
		nullb->tag_set.driver_data = nullb;

		rv = blk_mq_alloc_tag_set(&nullb->tag_set);
		if (rv)
			goto out_cleanup_queues;

		nullb->q = blk_mq_init_queue(&nullb->tag_set);
		if (!nullb->q) {
			rv = -ENOMEM;
			goto out_cleanup_tags;
		}
	} else if (queue_mode == NULL_Q_BIO) {
		nullb->q = blk_alloc_queue_node(GFP_KERNEL, home_node);
		if (!nullb->q) {
			rv = -ENOMEM;
			goto out_cleanup_queues;
		}
		blk_queue_make_request(nullb->q, null_queue_bio);
		init_driver_queues(nullb);
	} else {
		nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock, home_node);
		if (!nullb->q) {
			rv = -ENOMEM;
			goto out_cleanup_queues;
		}
		blk_queue_prep_rq(nullb->q, null_rq_prep_fn);
		blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
		init_driver_queues(nullb);
	}

	nullb->q->queuedata = nullb;
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q);
	queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, nullb->q);

	disk = nullb->disk = alloc_disk_node(1, home_node);
	if (!disk) {
		rv = -ENOMEM;
		goto out_cleanup_blk_queue;
	}

	mutex_lock(&lock);
	list_add_tail(&nullb->list, &nullb_list);
	nullb->index = nullb_indexes++;
	mutex_unlock(&lock);

	blk_queue_logical_block_size(nullb->q, bs);
	blk_queue_physical_block_size(nullb->q, bs);

	size = gb * 1024 * 1024 * 1024ULL;
	sector_div(size, bs);
	set_capacity(disk, size);

	disk->flags |= GENHD_FL_EXT_DEVT;
	disk->major		= null_major;
	disk->first_minor	= nullb->index;
	disk->fops		= &null_fops;
	disk->private_data	= nullb;
	disk->queue		= nullb->q;
	sprintf(disk->disk_name, "nullb%d", nullb->index);
	add_disk(disk);
	return 0;

out_cleanup_blk_queue:
	blk_cleanup_queue(nullb->q);
out_cleanup_tags:
	if (queue_mode == NULL_Q_MQ)
		blk_mq_free_tag_set(&nullb->tag_set);
out_cleanup_queues:
	cleanup_queues(nullb);
out_free_nullb:
	kfree(nullb);
out:
	return rv;
}
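
/*
 * Module init: validate parameters, initialize the per-cpu completion queues
 * and their hrtimers, register the block major and create nr_devices devices.
 */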
static int __init null_init(void)
{
	unsigned int i;

	if (bs > PAGE_SIZE) {
		pr_warn("null_blk: invalid block size\n");
		pr_warn("null_blk: defaulting block size to %lu\n", PAGE_SIZE);
		bs = PAGE_SIZE;
	}

	if (queue_mode == NULL_Q_MQ && use_per_node_hctx) {
		if (submit_queues < nr_online_nodes) {
			pr_warn("null_blk: submit_queues param is set to %u.\n",
							nr_online_nodes);
			submit_queues = nr_online_nodes;
		}
	} else if (submit_queues > nr_cpu_ids)
		submit_queues = nr_cpu_ids;
	else if (!submit_queues)
		submit_queues = 1;

	mutex_init(&lock);

	/* Initialize a separate list for each CPU for issuing softirqs */
	for_each_possible_cpu(i) {
		struct completion_queue *cq = &per_cpu(completion_queues, i);

		init_llist_head(&cq->list);

		if (irqmode != NULL_IRQ_TIMER)
			continue;

		hrtimer_init(&cq->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
		cq->timer.function = null_cmd_timer_expired;
	}

	null_major = register_blkdev(0, "nullb");
	if (null_major < 0)
		return null_major;

	for (i = 0; i < nr_devices; i++) {
		if (null_add_dev()) {
			unregister_blkdev(null_major, "nullb");
			return -EINVAL;
		}
	}

	pr_info("null: module loaded\n");
	return 0;
}
static void __exit null_exit(void)
{
	struct nullb *nullb;

	unregister_blkdev(null_major, "nullb");

	mutex_lock(&lock);
	while (!list_empty(&nullb_list)) {
		nullb = list_entry(nullb_list.next, struct nullb, list);
		null_del_dev(nullb);
	}
	mutex_unlock(&lock);
}

module_init(null_init);
module_exit(null_exit);

MODULE_AUTHOR("Jens Axboe <jaxboe@fusionio.com>");
MODULE_LICENSE("GPL");