blk-mq-tag.c

/*
 * Tag allocation using scalable bitmaps. Uses active queue tracking to support
 * fairer distribution of tags between multiple submitters when a shared tag map
 * is used.
 *
 * Copyright (C) 2013-2014 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>

#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"

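/*
 * Return true if any regular (non-reserved) tag is free. A NULL @tags is
 * treated as having free tags; only the regular bitmap is checked, not the
 * reserved one.
 */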
bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
{
        if (!tags)
                return true;

        return sbitmap_any_bit_clear(&tags->bitmap_tags.sb);
}

/*
 * If a previously inactive queue goes active, bump the active user count.
 * We need to do this before trying to allocate a driver tag, so that even
 * if the first allocation attempt fails, the other shared-tag users can
 * still reserve budget for this queue.
 */
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
        if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
            !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
                atomic_inc(&hctx->tags->active_queues);

        return true;
}

/*
 * Wake up all tasks potentially sleeping on tags.
 */
void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
{
        sbitmap_queue_wake_all(&tags->bitmap_tags);
        if (include_reserve)
                sbitmap_queue_wake_all(&tags->breserved_tags);
}

/*
 * If a previously busy queue goes inactive, potential waiters could now
 * be allowed to queue. Wake them up and check.
 */
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
        struct blk_mq_tags *tags = hctx->tags;

        if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
                return;

        atomic_dec(&tags->active_queues);

        blk_mq_tag_wakeup_all(tags, false);
}

/*
 * For shared tag users, we track the number of currently active users
 * and attempt to provide a fair share of the tag depth for each of them.
 */
static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
                                  struct sbitmap_queue *bt)
{
        unsigned int depth, users;

        if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED))
                return true;
        if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
                return true;

        /*
         * Don't try dividing an ant
         */
        if (bt->sb.depth == 1)
                return true;

        users = atomic_read(&hctx->tags->active_queues);
        if (!users)
                return true;

        /*
         * Allow at least some tags
         */
        depth = max((bt->sb.depth + users - 1) / users, 4U);
        return atomic_read(&hctx->nr_active) < depth;
}

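/*
 * Worked example of the fair-share limit above: with a 256-deep shared tag
 * map and 8 active users, each hctx may have at most
 * max((256 + 8 - 1) / 8, 4U) = 32 driver tags in flight.
 */

/*
 * Try to grab a free bit from @bt. Driver-tag allocations (those without
 * BLK_MQ_REQ_INTERNAL) are first gated by hctx_may_queue() so a shared-tag
 * user cannot exceed its fair share. Returns the bit number on success, or
 * -1 if no tag is currently available or the fair-share limit was reached.
 */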
static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
                            struct sbitmap_queue *bt)
{
        if (!(data->flags & BLK_MQ_REQ_INTERNAL) &&
            !hctx_may_queue(data->hctx, bt))
                return -1;
        if (data->shallow_depth)
                return __sbitmap_queue_get_shallow(bt, data->shallow_depth);
        else
                return __sbitmap_queue_get(bt);
}

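/*
 * Allocate a tag for @data, sleeping on the tag waitqueue if necessary
 * unless BLK_MQ_REQ_NOWAIT is set. Returns BLK_MQ_TAG_FAIL on failure.
 * Regular tags are offset by the number of reserved tags so that both
 * pools share a single tag number space.
 */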
unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
{
        struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
        struct sbitmap_queue *bt;
        struct sbq_wait_state *ws;
        DEFINE_WAIT(wait);
        unsigned int tag_offset;
        bool drop_ctx;
        int tag;

        if (data->flags & BLK_MQ_REQ_RESERVED) {
                if (unlikely(!tags->nr_reserved_tags)) {
                        WARN_ON_ONCE(1);
                        return BLK_MQ_TAG_FAIL;
                }
                bt = &tags->breserved_tags;
                tag_offset = 0;
        } else {
                bt = &tags->bitmap_tags;
                tag_offset = tags->nr_reserved_tags;
        }

        tag = __blk_mq_get_tag(data, bt);
        if (tag != -1)
                goto found_tag;

        if (data->flags & BLK_MQ_REQ_NOWAIT)
                return BLK_MQ_TAG_FAIL;

        ws = bt_wait_ptr(bt, data->hctx);
        drop_ctx = data->ctx == NULL;
        do {
                struct sbitmap_queue *bt_prev;

                /*
                 * We're out of tags on this hardware queue, kick any
                 * pending IO submits before going to sleep waiting for
                 * some to complete.
                 */
                blk_mq_run_hw_queue(data->hctx, false);

                /*
                 * Retry tag allocation after running the hardware queue,
                 * as running the queue may also have found completions.
                 */
                tag = __blk_mq_get_tag(data, bt);
                if (tag != -1)
                        break;

                prepare_to_wait_exclusive(&ws->wait, &wait,
                                          TASK_UNINTERRUPTIBLE);

                tag = __blk_mq_get_tag(data, bt);
                if (tag != -1)
                        break;

                if (data->ctx)
                        blk_mq_put_ctx(data->ctx);

                bt_prev = bt;
                io_schedule();

                data->ctx = blk_mq_get_ctx(data->q);
                data->hctx = blk_mq_map_queue(data->q, data->ctx->cpu);
                tags = blk_mq_tags_from_data(data);
                if (data->flags & BLK_MQ_REQ_RESERVED)
                        bt = &tags->breserved_tags;
                else
                        bt = &tags->bitmap_tags;

                finish_wait(&ws->wait, &wait);

                /*
                 * If the destination hw queue changed while we slept, issue
                 * a fake wake-up on the previous queue to compensate for the
                 * missed wake-up, so other allocations on the previous queue
                 * are not starved.
                 */
                if (bt != bt_prev)
                        sbitmap_queue_wake_up(bt_prev);

                ws = bt_wait_ptr(bt, data->hctx);
        } while (1);

        if (drop_ctx && data->ctx)
                blk_mq_put_ctx(data->ctx);

        finish_wait(&ws->wait, &wait);

found_tag:
        return tag + tag_offset;
}

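/*
 * Release a tag back to the pool it was allocated from, undoing the offset
 * that blk_mq_get_tag() applied to non-reserved tags.
 */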
void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags,
                    struct blk_mq_ctx *ctx, unsigned int tag)
{
        if (!blk_mq_tag_is_reserved(tags, tag)) {
                const int real_tag = tag - tags->nr_reserved_tags;

                BUG_ON(real_tag >= tags->nr_tags);
                sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu);
        } else {
                BUG_ON(tag >= tags->nr_reserved_tags);
                sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu);
        }
}

struct bt_iter_data {
        struct blk_mq_hw_ctx *hctx;
        busy_iter_fn *fn;
        void *data;
        bool reserved;
};

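/*
 * sbitmap_for_each_set() callback: translate the set bit back into a
 * request and, if it belongs to this hctx's queue, hand it to the
 * caller-supplied busy_iter_fn.
 */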
static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
        struct bt_iter_data *iter_data = data;
        struct blk_mq_hw_ctx *hctx = iter_data->hctx;
        struct blk_mq_tags *tags = hctx->tags;
        bool reserved = iter_data->reserved;
        struct request *rq;

        if (!reserved)
                bitnr += tags->nr_reserved_tags;
        rq = tags->rqs[bitnr];

        /*
         * We can hit rq == NULL here, because the tagging functions
         * test and set the bit before assigning ->rqs[].
         */
        if (rq && rq->q == hctx->queue)
                iter_data->fn(hctx, rq, iter_data->data, reserved);
        return true;
}

/**
 * bt_for_each - iterate over the requests associated with a hardware queue
 * @hctx: Hardware queue to examine.
 * @bt: sbitmap to examine. This is either the breserved_tags member
 *      or the bitmap_tags member of struct blk_mq_tags.
 * @fn: Pointer to the function that will be called for each request
 *      associated with @hctx that has been assigned a driver tag.
 *      @fn will be called as follows: @fn(@hctx, rq, @data, @reserved)
 *      where rq is a pointer to a request.
 * @data: Will be passed as third argument to @fn.
 * @reserved: Indicates whether @bt is the breserved_tags member or the
 *      bitmap_tags member of struct blk_mq_tags.
 */
static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt,
                        busy_iter_fn *fn, void *data, bool reserved)
{
        struct bt_iter_data iter_data = {
                .hctx = hctx,
                .fn = fn,
                .data = data,
                .reserved = reserved,
        };

        sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data);
}

struct bt_tags_iter_data {
        struct blk_mq_tags *tags;
        busy_tag_iter_fn *fn;
        void *data;
        bool reserved;
};

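/*
 * sbitmap_for_each_set() callback: like bt_iter(), but operates on a bare
 * tag map and only visits requests that have already been started.
 */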
static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
        struct bt_tags_iter_data *iter_data = data;
        struct blk_mq_tags *tags = iter_data->tags;
        bool reserved = iter_data->reserved;
        struct request *rq;

        if (!reserved)
                bitnr += tags->nr_reserved_tags;

        /*
         * We can hit rq == NULL here, because the tagging functions
         * test and set the bit before assigning ->rqs[].
         */
        rq = tags->rqs[bitnr];
        if (rq && blk_mq_request_started(rq))
                iter_data->fn(rq, iter_data->data, reserved);

        return true;
}

/**
 * bt_tags_for_each - iterate over the requests in a tag map
 * @tags: Tag map to iterate over.
 * @bt: sbitmap to examine. This is either the breserved_tags member
 *      or the bitmap_tags member of struct blk_mq_tags.
 * @fn: Pointer to the function that will be called for each started
 *      request. @fn will be called as follows: @fn(rq, @data,
 *      @reserved) where rq is a pointer to a request.
 * @data: Will be passed as second argument to @fn.
 * @reserved: Indicates whether @bt is the breserved_tags member or the
 *      bitmap_tags member of struct blk_mq_tags.
 */
static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
                             busy_tag_iter_fn *fn, void *data, bool reserved)
{
        struct bt_tags_iter_data iter_data = {
                .tags = tags,
                .fn = fn,
                .data = data,
                .reserved = reserved,
        };

        if (tags->rqs)
                sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data);
}

/**
 * blk_mq_all_tag_busy_iter - iterate over all started requests in a tag map
 * @tags: Tag map to iterate over.
 * @fn: Pointer to the function that will be called for each started
 *      request. @fn will be called as follows: @fn(rq, @priv,
 *      reserved) where rq is a pointer to a request. 'reserved'
 *      indicates whether or not @rq is a reserved request.
 * @priv: Will be passed as second argument to @fn.
 */
static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
                                     busy_tag_iter_fn *fn, void *priv)
{
        if (tags->nr_reserved_tags)
                bt_tags_for_each(tags, &tags->breserved_tags, fn, priv, true);
        bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, false);
}

/**
 * blk_mq_tagset_busy_iter - iterate over all started requests in a tag set
 * @tagset: Tag set to iterate over.
 * @fn: Pointer to the function that will be called for each started
 *      request. @fn will be called as follows: @fn(rq, @priv,
 *      reserved) where rq is a pointer to a request. 'reserved'
 *      indicates whether or not @rq is a reserved request.
 * @priv: Will be passed as second argument to @fn.
 */
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
                             busy_tag_iter_fn *fn, void *priv)
{
        int i;

        for (i = 0; i < tagset->nr_hw_queues; i++) {
                if (tagset->tags && tagset->tags[i])
                        blk_mq_all_tag_busy_iter(tagset->tags[i], fn, priv);
        }
}
EXPORT_SYMBOL(blk_mq_tagset_busy_iter);

/**
 * blk_mq_queue_tag_busy_iter - iterate over all requests with a driver tag
 * @q: Request queue to examine.
 * @fn: Pointer to the function that will be called for each request
 *      on @q. @fn will be called as follows: @fn(hctx, rq, @priv,
 *      reserved) where rq is a pointer to a request and hctx points
 *      to the hardware queue associated with the request. 'reserved'
 *      indicates whether or not @rq is a reserved request.
 * @priv: Will be passed as third argument to @fn.
 *
 * Note: if @q->tag_set is shared with other request queues then @fn will be
 * called for all requests on all queues that share that tag set and not only
 * for requests associated with @q.
 */
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
                                void *priv)
{
        struct blk_mq_hw_ctx *hctx;
        int i;

        /*
         * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx
         * while the queue is frozen. So we can use q_usage_counter to avoid
         * racing with it. __blk_mq_update_nr_hw_queues() uses
         * synchronize_rcu() to ensure this function left the critical section
         * below.
         */
        if (!percpu_ref_tryget(&q->q_usage_counter))
                return;

        queue_for_each_hw_ctx(q, hctx, i) {
                struct blk_mq_tags *tags = hctx->tags;

                /*
                 * If no software queues are currently mapped to this
                 * hardware queue, there's nothing to check.
                 */
                if (!blk_mq_hw_queue_mapped(hctx))
                        continue;

                if (tags->nr_reserved_tags)
                        bt_for_each(hctx, &tags->breserved_tags, fn, priv, true);
                bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false);
        }
        blk_queue_exit(q);
}

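/*
 * Allocate one sbitmap_queue of @depth bits on @node. The shift argument of
 * -1 lets the sbitmap code pick its default word layout.
 */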
static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth,
                    bool round_robin, int node)
{
        return sbitmap_queue_init_node(bt, depth, -1, round_robin, GFP_KERNEL,
                                       node);
}

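/*
 * Initialize the regular and reserved tag bitmaps for @tags. On failure,
 * anything allocated so far (including @tags itself) is freed and NULL is
 * returned.
 */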
static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
                                                   int node, int alloc_policy)
{
        unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
        bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR;

        if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node))
                goto free_tags;
        if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, round_robin,
                     node))
                goto free_bitmap_tags;

        return tags;
free_bitmap_tags:
        sbitmap_queue_free(&tags->bitmap_tags);
free_tags:
        kfree(tags);
        return NULL;
}

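/*
 * Allocate a blk_mq_tags structure for @total_tags tags, of which
 * @reserved_tags are set aside for BLK_MQ_REQ_RESERVED allocations.
 * Returns NULL if the depth is out of range or memory is exhausted.
 */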
struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
                                     unsigned int reserved_tags,
                                     int node, int alloc_policy)
{
        struct blk_mq_tags *tags;

        if (total_tags > BLK_MQ_TAG_MAX) {
                pr_err("blk-mq: tag depth too large\n");
                return NULL;
        }

        tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node);
        if (!tags)
                return NULL;

        tags->nr_tags = total_tags;
        tags->nr_reserved_tags = reserved_tags;

        return blk_mq_init_bitmap_tags(tags, node, alloc_policy);
}

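/*
 * Free both tag bitmaps and the blk_mq_tags structure itself.
 */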
void blk_mq_free_tags(struct blk_mq_tags *tags)
{
        sbitmap_queue_free(&tags->bitmap_tags);
        sbitmap_queue_free(&tags->breserved_tags);
        kfree(tags);
}

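/*
 * Update the depth of @tagsptr to @tdepth. Shrinking simply resizes the
 * existing bitmap; growing beyond the original nr_tags requires @can_grow
 * and replaces the request map with a freshly allocated one. The number of
 * reserved tags is never changed.
 */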
int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
                            struct blk_mq_tags **tagsptr, unsigned int tdepth,
                            bool can_grow)
{
        struct blk_mq_tags *tags = *tagsptr;

        if (tdepth <= tags->nr_reserved_tags)
                return -EINVAL;

        /*
         * If we are allowed to grow beyond the original size, allocate
         * a new set of tags before freeing the old one.
         */
        if (tdepth > tags->nr_tags) {
                struct blk_mq_tag_set *set = hctx->queue->tag_set;
                struct blk_mq_tags *new;
                bool ret;

                if (!can_grow)
                        return -EINVAL;

                /*
                 * We need some sort of upper limit, set it high enough that
                 * no valid use cases should require more.
                 */
                if (tdepth > 16 * BLKDEV_MAX_RQ)
                        return -EINVAL;

                new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth,
                                          tags->nr_reserved_tags);
                if (!new)
                        return -ENOMEM;
                ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
                if (ret) {
                        blk_mq_free_rq_map(new);
                        return -ENOMEM;
                }

                blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);
                blk_mq_free_rq_map(*tagsptr);
                *tagsptr = new;
        } else {
                /*
                 * Don't need (or can't) update reserved tags here, they
                 * remain static and should never need resizing.
                 */
                sbitmap_queue_resize(&tags->bitmap_tags,
                                     tdepth - tags->nr_reserved_tags);
        }

        return 0;
}

/**
 * blk_mq_unique_tag() - return a tag that is unique queue-wide
 * @rq: request for which to compute a unique tag
 *
 * The tag field in struct request is unique per hardware queue but not over
 * all hardware queues. Hence this function that returns a tag with the
 * hardware context index in the upper bits and the per hardware queue tag in
 * the lower bits.
 *
 * Note: When called for a request that is queued on a non-multiqueue request
 * queue, the hardware context index is set to zero.
 */
u32 blk_mq_unique_tag(struct request *rq)
{
        struct request_queue *q = rq->q;
        struct blk_mq_hw_ctx *hctx;
        int hwq = 0;

        if (q->mq_ops) {
                hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
                hwq = hctx->queue_num;
        }

        return (hwq << BLK_MQ_UNIQUE_TAG_BITS) |
                (rq->tag & BLK_MQ_UNIQUE_TAG_MASK);
}
EXPORT_SYMBOL(blk_mq_unique_tag);
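
/*
 * Illustrative usage sketch (not part of this file): a driver that stored
 * the value returned by blk_mq_unique_tag() can split it back into its two
 * halves with the companion helpers declared in <linux/blk-mq.h>:
 *
 *      u32 unique = blk_mq_unique_tag(rq);
 *      u16 hwq    = blk_mq_unique_tag_to_hwq(unique);
 *      u16 tag    = blk_mq_unique_tag_to_tag(unique);
 *
 * These simply undo the shift and mask applied above.
 */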