kyber-iosched.c

/*
 * The Kyber I/O scheduler. Controls latency by throttling queue depths using
 * scalable techniques.
 *
 * Copyright (C) 2017 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/elevator.h>
#include <linux/module.h>
#include <linux/sbitmap.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"

#define CREATE_TRACE_POINTS
#include <trace/events/kyber.h>

/*
 * Scheduling domains: the device is divided into multiple domains based on the
 * request type.
 */
enum {
        KYBER_READ,
        KYBER_WRITE,
        KYBER_DISCARD,
        KYBER_OTHER,
        KYBER_NUM_DOMAINS,
};

static const char *kyber_domain_names[] = {
        [KYBER_READ] = "READ",
        [KYBER_WRITE] = "WRITE",
        [KYBER_DISCARD] = "DISCARD",
        [KYBER_OTHER] = "OTHER",
};

enum {
        /*
         * In order to prevent starvation of synchronous requests by a flood of
         * asynchronous requests, we reserve 25% of requests for synchronous
         * operations.
         */
        KYBER_ASYNC_PERCENT = 75,
};

/*
 * Maximum device-wide depth for each scheduling domain.
 *
 * Even for fast devices with lots of tags like NVMe, you can saturate the
 * device with only a fraction of the maximum possible queue depth. So, we cap
 * these to a reasonable value.
 */
static const unsigned int kyber_depth[] = {
        [KYBER_READ] = 256,
        [KYBER_WRITE] = 128,
        [KYBER_DISCARD] = 64,
        [KYBER_OTHER] = 16,
};

/*
 * Default latency targets for each scheduling domain.
 */
static const u64 kyber_latency_targets[] = {
        [KYBER_READ] = 2ULL * NSEC_PER_MSEC,
        [KYBER_WRITE] = 10ULL * NSEC_PER_MSEC,
        [KYBER_DISCARD] = 5ULL * NSEC_PER_SEC,
};

/*
 * Batch size (number of requests we'll dispatch in a row) for each scheduling
 * domain.
 */
static const unsigned int kyber_batch_size[] = {
        [KYBER_READ] = 16,
        [KYBER_WRITE] = 8,
        [KYBER_DISCARD] = 1,
        [KYBER_OTHER] = 1,
};
/*
 * Request latencies are recorded in a histogram with buckets defined relative
 * to the target latency:
 *
 * <= 1/4 * target latency
 * <= 1/2 * target latency
 * <= 3/4 * target latency
 * <= target latency
 * <= 1 1/4 * target latency
 * <= 1 1/2 * target latency
 * <= 1 3/4 * target latency
 * > 1 3/4 * target latency
 */
enum {
        /*
         * The width of the latency histogram buckets is
         * 1 / (1 << KYBER_LATENCY_SHIFT) * target latency.
         */
        KYBER_LATENCY_SHIFT = 2,
        /*
         * The first (1 << KYBER_LATENCY_SHIFT) buckets are <= target latency,
         * thus, "good".
         */
        KYBER_GOOD_BUCKETS = 1 << KYBER_LATENCY_SHIFT,
        /* There are also (1 << KYBER_LATENCY_SHIFT) "bad" buckets. */
        KYBER_LATENCY_BUCKETS = 2 << KYBER_LATENCY_SHIFT,
};
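
/*
 * Worked example (using the default 2 ms read target defined above): the
 * bucket width is 2 ms / (1 << KYBER_LATENCY_SHIFT) = 500 us, so a 1.3 ms
 * read falls in the third "good" bucket (<= 3/4 * target) and a 3.7 ms read
 * falls in the last "bad" bucket (> 1 3/4 * target).
 */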
/*
 * We measure both the total latency and the I/O latency (i.e., latency after
 * submitting to the device).
 */
enum {
        KYBER_TOTAL_LATENCY,
        KYBER_IO_LATENCY,
};

static const char *kyber_latency_type_names[] = {
        [KYBER_TOTAL_LATENCY] = "total",
        [KYBER_IO_LATENCY] = "I/O",
};

/*
 * Per-cpu latency histograms: total latency and I/O latency for each scheduling
 * domain except for KYBER_OTHER.
 */
struct kyber_cpu_latency {
        atomic_t buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];
};
/*
 * There is the same mapping between ctx & hctx and kcq & khd; we use
 * request->mq_ctx->index_hw to index the kcq in khd.
 */
struct kyber_ctx_queue {
        /*
         * Used to ensure that operations on rq_list and kcq_map are atomic.
         * Also protects the requests on rq_list during merging.
         */
        spinlock_t lock;
        struct list_head rq_list[KYBER_NUM_DOMAINS];
} ____cacheline_aligned_in_smp;
struct kyber_queue_data {
        struct request_queue *q;

        /*
         * Each scheduling domain has a limited number of in-flight requests
         * device-wide, limited by these tokens.
         */
        struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];

        /*
         * Async request percentage, converted to per-word depth for
         * sbitmap_get_shallow().
         */
        unsigned int async_depth;

        struct kyber_cpu_latency __percpu *cpu_latency;

        /* Timer for stats aggregation and adjusting domain tokens. */
        struct timer_list timer;

        unsigned int latency_buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];

        unsigned long latency_timeout[KYBER_OTHER];

        int domain_p99[KYBER_OTHER];

        /* Target latencies in nanoseconds. */
        u64 latency_targets[KYBER_OTHER];
};

struct kyber_hctx_data {
        spinlock_t lock;
        struct list_head rqs[KYBER_NUM_DOMAINS];
        unsigned int cur_domain;
        unsigned int batching;
        struct kyber_ctx_queue *kcqs;
        struct sbitmap kcq_map[KYBER_NUM_DOMAINS];
        wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
        struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
        atomic_t wait_index[KYBER_NUM_DOMAINS];
};

static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
                             void *key);

static unsigned int kyber_sched_domain(unsigned int op)
{
        switch (op & REQ_OP_MASK) {
        case REQ_OP_READ:
                return KYBER_READ;
        case REQ_OP_WRITE:
                return KYBER_WRITE;
        case REQ_OP_DISCARD:
                return KYBER_DISCARD;
        default:
                return KYBER_OTHER;
        }
}

static void flush_latency_buckets(struct kyber_queue_data *kqd,
                                  struct kyber_cpu_latency *cpu_latency,
                                  unsigned int sched_domain, unsigned int type)
{
        unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
        atomic_t *cpu_buckets = cpu_latency->buckets[sched_domain][type];
        unsigned int bucket;

        for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
                buckets[bucket] += atomic_xchg(&cpu_buckets[bucket], 0);
}

/*
 * Calculate the histogram bucket with the given percentile rank, or -1 if there
 * aren't enough samples yet.
 */
static int calculate_percentile(struct kyber_queue_data *kqd,
                                unsigned int sched_domain, unsigned int type,
                                unsigned int percentile)
{
        unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
        unsigned int bucket, samples = 0, percentile_samples;

        for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
                samples += buckets[bucket];

        if (!samples)
                return -1;

        /*
         * We do the calculation once we have 500 samples or one second passes
         * since the first sample was recorded, whichever comes first.
         */
        if (!kqd->latency_timeout[sched_domain])
                kqd->latency_timeout[sched_domain] = max(jiffies + HZ, 1UL);
        if (samples < 500 &&
            time_is_after_jiffies(kqd->latency_timeout[sched_domain])) {
                return -1;
        }
        kqd->latency_timeout[sched_domain] = 0;

        percentile_samples = DIV_ROUND_UP(samples * percentile, 100);
        for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS - 1; bucket++) {
                if (buckets[bucket] >= percentile_samples)
                        break;
                percentile_samples -= buckets[bucket];
        }
        memset(buckets, 0, sizeof(kqd->latency_buckets[sched_domain][type]));

        trace_kyber_latency(kqd->q, kyber_domain_names[sched_domain],
                            kyber_latency_type_names[type], percentile,
                            bucket + 1, 1 << KYBER_LATENCY_SHIFT, samples);

        return bucket;
}
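
/*
 * Illustrative example for the percentile search above: with 1000 samples
 * distributed as {100, 200, 300, 250, 100, 30, 15, 5} and percentile = 99,
 * percentile_samples starts at 990; the first six buckets account for 980
 * samples, and bucket 6 holds at least the remaining 10, so the p99 lands in
 * bucket 6 (<= 1 3/4 * target), which counts as "bad" since it is
 * >= KYBER_GOOD_BUCKETS.
 */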
static void kyber_resize_domain(struct kyber_queue_data *kqd,
                                unsigned int sched_domain, unsigned int depth)
{
        depth = clamp(depth, 1U, kyber_depth[sched_domain]);
        if (depth != kqd->domain_tokens[sched_domain].sb.depth) {
                sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
                trace_kyber_adjust(kqd->q, kyber_domain_names[sched_domain],
                                   depth);
        }
}

static void kyber_timer_fn(struct timer_list *t)
{
        struct kyber_queue_data *kqd = from_timer(kqd, t, timer);
        unsigned int sched_domain;
        int cpu;
        bool bad = false;

        /* Sum all of the per-cpu latency histograms. */
        for_each_online_cpu(cpu) {
                struct kyber_cpu_latency *cpu_latency;

                cpu_latency = per_cpu_ptr(kqd->cpu_latency, cpu);
                for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
                        flush_latency_buckets(kqd, cpu_latency, sched_domain,
                                              KYBER_TOTAL_LATENCY);
                        flush_latency_buckets(kqd, cpu_latency, sched_domain,
                                              KYBER_IO_LATENCY);
                }
        }

        /*
         * Check if any domains have a high I/O latency, which might indicate
         * congestion in the device. Note that we use the p90; we don't want to
         * be too sensitive to outliers here.
         */
        for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
                int p90;

                p90 = calculate_percentile(kqd, sched_domain, KYBER_IO_LATENCY,
                                           90);
                if (p90 >= KYBER_GOOD_BUCKETS)
                        bad = true;
        }

        /*
         * Adjust the scheduling domain depths. If we determined that there was
         * congestion, we throttle all domains with good latencies. Either way,
         * we ease up on throttling domains with bad latencies.
         */
        for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
                unsigned int orig_depth, depth;
                int p99;

                p99 = calculate_percentile(kqd, sched_domain,
                                           KYBER_TOTAL_LATENCY, 99);
                /*
                 * This is kind of subtle: different domains will not
                 * necessarily have enough samples to calculate the latency
                 * percentiles during the same window, so we have to remember
                 * the p99 for the next time we observe congestion; once we do,
                 * we don't want to throttle again until we get more data, so we
                 * reset it to -1.
                 */
                if (bad) {
                        if (p99 < 0)
                                p99 = kqd->domain_p99[sched_domain];
                        kqd->domain_p99[sched_domain] = -1;
                } else if (p99 >= 0) {
                        kqd->domain_p99[sched_domain] = p99;
                }
                if (p99 < 0)
                        continue;

                /*
                 * If this domain has bad latency, throttle less. Otherwise,
                 * throttle more iff we determined that there is congestion.
                 *
                 * The new depth is scaled linearly with the p99 latency vs the
                 * latency target. E.g., if the p99 is 3/4 of the target, then
                 * we throttle down to 3/4 of the current depth, and if the p99
                 * is 2x the target, then we double the depth.
                 */
                if (bad || p99 >= KYBER_GOOD_BUCKETS) {
                        orig_depth = kqd->domain_tokens[sched_domain].sb.depth;
                        depth = (orig_depth * (p99 + 1)) >> KYBER_LATENCY_SHIFT;
                        kyber_resize_domain(kqd, sched_domain, depth);
                }
        }
}
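
/*
 * Worked example of the depth scaling in kyber_timer_fn(): with
 * KYBER_LATENCY_SHIFT = 2, a p99 in bucket 2 (<= 3/4 * target) gives
 * depth = orig_depth * 3 / 4, while a p99 in bucket 7 (> 1 3/4 * target)
 * gives depth = orig_depth * 8 / 4, i.e. double the current depth, subject
 * to the clamp in kyber_resize_domain().
 */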
static unsigned int kyber_sched_tags_shift(struct request_queue *q)
{
        /*
         * All of the hardware queues have the same depth, so we can just grab
         * the shift of the first one.
         */
        return q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
}

static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
{
        struct kyber_queue_data *kqd;
        unsigned int shift;
        int ret = -ENOMEM;
        int i;

        kqd = kzalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
        if (!kqd)
                goto err;

        kqd->q = q;

        kqd->cpu_latency = alloc_percpu_gfp(struct kyber_cpu_latency,
                                            GFP_KERNEL | __GFP_ZERO);
        if (!kqd->cpu_latency)
                goto err_kqd;

        timer_setup(&kqd->timer, kyber_timer_fn, 0);

        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
                WARN_ON(!kyber_depth[i]);
                WARN_ON(!kyber_batch_size[i]);
                ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
                                              kyber_depth[i], -1, false,
                                              GFP_KERNEL, q->node);
                if (ret) {
                        while (--i >= 0)
                                sbitmap_queue_free(&kqd->domain_tokens[i]);
                        goto err_buckets;
                }
        }

        for (i = 0; i < KYBER_OTHER; i++) {
                kqd->domain_p99[i] = -1;
                kqd->latency_targets[i] = kyber_latency_targets[i];
        }

        shift = kyber_sched_tags_shift(q);
        kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;

        return kqd;

err_buckets:
        free_percpu(kqd->cpu_latency);
err_kqd:
        kfree(kqd);
err:
        return ERR_PTR(ret);
}
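
/*
 * Note on the async_depth calculation above: the shift is the per-word shift
 * of the scheduler tag sbitmap, so with e.g. 64 tags per word (shift = 6),
 * async_depth = 64 * KYBER_ASYNC_PERCENT / 100 = 48, i.e. asynchronous
 * requests may occupy at most 48 of every 64 tags in a word; see
 * kyber_limit_depth() below.
 */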
static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
{
        struct kyber_queue_data *kqd;
        struct elevator_queue *eq;

        eq = elevator_alloc(q, e);
        if (!eq)
                return -ENOMEM;

        kqd = kyber_queue_data_alloc(q);
        if (IS_ERR(kqd)) {
                kobject_put(&eq->kobj);
                return PTR_ERR(kqd);
        }

        blk_stat_enable_accounting(q);

        eq->elevator_data = kqd;
        q->elevator = eq;

        return 0;
}

static void kyber_exit_sched(struct elevator_queue *e)
{
        struct kyber_queue_data *kqd = e->elevator_data;
        int i;

        del_timer_sync(&kqd->timer);

        for (i = 0; i < KYBER_NUM_DOMAINS; i++)
                sbitmap_queue_free(&kqd->domain_tokens[i]);
        free_percpu(kqd->cpu_latency);
        kfree(kqd);
}

static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq)
{
        unsigned int i;

        spin_lock_init(&kcq->lock);
        for (i = 0; i < KYBER_NUM_DOMAINS; i++)
                INIT_LIST_HEAD(&kcq->rq_list[i]);
}

static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
        struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
        struct kyber_hctx_data *khd;
        int i;

        khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node);
        if (!khd)
                return -ENOMEM;

        khd->kcqs = kmalloc_array_node(hctx->nr_ctx,
                                       sizeof(struct kyber_ctx_queue),
                                       GFP_KERNEL, hctx->numa_node);
        if (!khd->kcqs)
                goto err_khd;

        for (i = 0; i < hctx->nr_ctx; i++)
                kyber_ctx_queue_init(&khd->kcqs[i]);

        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
                if (sbitmap_init_node(&khd->kcq_map[i], hctx->nr_ctx,
                                      ilog2(8), GFP_KERNEL, hctx->numa_node)) {
                        while (--i >= 0)
                                sbitmap_free(&khd->kcq_map[i]);
                        goto err_kcqs;
                }
        }

        spin_lock_init(&khd->lock);

        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
                INIT_LIST_HEAD(&khd->rqs[i]);
                init_waitqueue_func_entry(&khd->domain_wait[i],
                                          kyber_domain_wake);
                khd->domain_wait[i].private = hctx;
                INIT_LIST_HEAD(&khd->domain_wait[i].entry);
                atomic_set(&khd->wait_index[i], 0);
        }

        khd->cur_domain = 0;
        khd->batching = 0;

        hctx->sched_data = khd;
        sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags,
                                        kqd->async_depth);

        return 0;

err_kcqs:
        kfree(khd->kcqs);
err_khd:
        kfree(khd);
        return -ENOMEM;
}
static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
        struct kyber_hctx_data *khd = hctx->sched_data;
        int i;

        for (i = 0; i < KYBER_NUM_DOMAINS; i++)
                sbitmap_free(&khd->kcq_map[i]);
        kfree(khd->kcqs);
        kfree(hctx->sched_data);
}

static int rq_get_domain_token(struct request *rq)
{
        return (long)rq->elv.priv[0];
}

static void rq_set_domain_token(struct request *rq, int token)
{
        rq->elv.priv[0] = (void *)(long)token;
}

static void rq_clear_domain_token(struct kyber_queue_data *kqd,
                                  struct request *rq)
{
        unsigned int sched_domain;
        int nr;

        nr = rq_get_domain_token(rq);
        if (nr != -1) {
                sched_domain = kyber_sched_domain(rq->cmd_flags);
                sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
                                    rq->mq_ctx->cpu);
        }
}

static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
{
        /*
         * We use the scheduler tags as per-hardware queue queueing tokens.
         * Async requests can be limited at this stage.
         */
        if (!op_is_sync(op)) {
                struct kyber_queue_data *kqd = data->q->elevator->elevator_data;

                data->shallow_depth = kqd->async_depth;
        }
}
static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
{
        struct kyber_hctx_data *khd = hctx->sched_data;
        struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue);
        struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw];
        unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
        struct list_head *rq_list = &kcq->rq_list[sched_domain];
        bool merged;

        spin_lock(&kcq->lock);
        merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio);
        spin_unlock(&kcq->lock);
        blk_mq_put_ctx(ctx);

        return merged;
}

static void kyber_prepare_request(struct request *rq, struct bio *bio)
{
        rq_set_domain_token(rq, -1);
}

static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
                                  struct list_head *rq_list, bool at_head)
{
        struct kyber_hctx_data *khd = hctx->sched_data;
        struct request *rq, *next;

        list_for_each_entry_safe(rq, next, rq_list, queuelist) {
                unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
                struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw];
                struct list_head *head = &kcq->rq_list[sched_domain];

                spin_lock(&kcq->lock);
                if (at_head)
                        list_move(&rq->queuelist, head);
                else
                        list_move_tail(&rq->queuelist, head);
                sbitmap_set_bit(&khd->kcq_map[sched_domain],
                                rq->mq_ctx->index_hw);
                blk_mq_sched_request_inserted(rq);
                spin_unlock(&kcq->lock);
        }
}

static void kyber_finish_request(struct request *rq)
{
        struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;

        rq_clear_domain_token(kqd, rq);
}
static void add_latency_sample(struct kyber_cpu_latency *cpu_latency,
                               unsigned int sched_domain, unsigned int type,
                               u64 target, u64 latency)
{
        unsigned int bucket;
        u64 divisor;

        if (latency > 0) {
                divisor = max_t(u64, target >> KYBER_LATENCY_SHIFT, 1);
                bucket = min_t(unsigned int, div64_u64(latency - 1, divisor),
                               KYBER_LATENCY_BUCKETS - 1);
        } else {
                bucket = 0;
        }

        atomic_inc(&cpu_latency->buckets[sched_domain][type][bucket]);
}
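
/*
 * For example, with the default 2 ms read target the divisor above is
 * 2,000,000 ns >> KYBER_LATENCY_SHIFT = 500,000 ns, so a 1.2 ms sample is
 * counted in bucket (1,200,000 - 1) / 500,000 = 2, and a 10 ms sample is
 * clamped to the last bucket, KYBER_LATENCY_BUCKETS - 1 = 7.
 */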
static void kyber_completed_request(struct request *rq, u64 now)
{
        struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
        struct kyber_cpu_latency *cpu_latency;
        unsigned int sched_domain;
        u64 target;

        sched_domain = kyber_sched_domain(rq->cmd_flags);
        if (sched_domain == KYBER_OTHER)
                return;

        cpu_latency = get_cpu_ptr(kqd->cpu_latency);
        target = kqd->latency_targets[sched_domain];
        add_latency_sample(cpu_latency, sched_domain, KYBER_TOTAL_LATENCY,
                           target, now - rq->start_time_ns);
        add_latency_sample(cpu_latency, sched_domain, KYBER_IO_LATENCY, target,
                           now - rq->io_start_time_ns);
        put_cpu_ptr(kqd->cpu_latency);

        timer_reduce(&kqd->timer, jiffies + HZ / 10);
}
struct flush_kcq_data {
        struct kyber_hctx_data *khd;
        unsigned int sched_domain;
        struct list_head *list;
};

static bool flush_busy_kcq(struct sbitmap *sb, unsigned int bitnr, void *data)
{
        struct flush_kcq_data *flush_data = data;
        struct kyber_ctx_queue *kcq = &flush_data->khd->kcqs[bitnr];

        spin_lock(&kcq->lock);
        list_splice_tail_init(&kcq->rq_list[flush_data->sched_domain],
                              flush_data->list);
        sbitmap_clear_bit(sb, bitnr);
        spin_unlock(&kcq->lock);

        return true;
}

static void kyber_flush_busy_kcqs(struct kyber_hctx_data *khd,
                                  unsigned int sched_domain,
                                  struct list_head *list)
{
        struct flush_kcq_data data = {
                .khd = khd,
                .sched_domain = sched_domain,
                .list = list,
        };

        sbitmap_for_each_set(&khd->kcq_map[sched_domain],
                             flush_busy_kcq, &data);
}

static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
                             void *key)
{
        struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private);

        list_del_init(&wait->entry);
        blk_mq_run_hw_queue(hctx, true);
        return 1;
}
static int kyber_get_domain_token(struct kyber_queue_data *kqd,
                                  struct kyber_hctx_data *khd,
                                  struct blk_mq_hw_ctx *hctx)
{
        unsigned int sched_domain = khd->cur_domain;
        struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
        wait_queue_entry_t *wait = &khd->domain_wait[sched_domain];
        struct sbq_wait_state *ws;
        int nr;

        nr = __sbitmap_queue_get(domain_tokens);

        /*
         * If we failed to get a domain token, make sure the hardware queue is
         * run when one becomes available. Note that this is serialized on
         * khd->lock, but we still need to be careful about the waker.
         */
        if (nr < 0 && list_empty_careful(&wait->entry)) {
                ws = sbq_wait_ptr(domain_tokens,
                                  &khd->wait_index[sched_domain]);
                khd->domain_ws[sched_domain] = ws;
                add_wait_queue(&ws->wait, wait);

                /*
                 * Try again in case a token was freed before we got on the wait
                 * queue.
                 */
                nr = __sbitmap_queue_get(domain_tokens);
        }

        /*
         * If we got a token while we were on the wait queue, remove ourselves
         * from the wait queue to ensure that all wake ups make forward
         * progress. It's possible that the waker already deleted the entry
         * between the !list_empty_careful() check and us grabbing the lock, but
         * list_del_init() is okay with that.
         */
        if (nr >= 0 && !list_empty_careful(&wait->entry)) {
                ws = khd->domain_ws[sched_domain];
                spin_lock_irq(&ws->wait.lock);
                list_del_init(&wait->entry);
                spin_unlock_irq(&ws->wait.lock);
        }

        return nr;
}
static struct request *
kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
                          struct kyber_hctx_data *khd,
                          struct blk_mq_hw_ctx *hctx)
{
        struct list_head *rqs;
        struct request *rq;
        int nr;

        rqs = &khd->rqs[khd->cur_domain];

        /*
         * If we already have a flushed request, then we just need to get a
         * token for it. Otherwise, if there are pending requests in the kcqs,
         * flush the kcqs, but only if we can get a token. If not, we should
         * leave the requests in the kcqs so that they can be merged. Note that
         * khd->lock serializes the flushes, so if we observed any bit set in
         * the kcq_map, we will always get a request.
         */
        rq = list_first_entry_or_null(rqs, struct request, queuelist);
        if (rq) {
                nr = kyber_get_domain_token(kqd, khd, hctx);
                if (nr >= 0) {
                        khd->batching++;
                        rq_set_domain_token(rq, nr);
                        list_del_init(&rq->queuelist);
                        return rq;
                } else {
                        trace_kyber_throttled(kqd->q,
                                              kyber_domain_names[khd->cur_domain]);
                }
        } else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) {
                nr = kyber_get_domain_token(kqd, khd, hctx);
                if (nr >= 0) {
                        kyber_flush_busy_kcqs(khd, khd->cur_domain, rqs);
                        rq = list_first_entry(rqs, struct request, queuelist);
                        khd->batching++;
                        rq_set_domain_token(rq, nr);
                        list_del_init(&rq->queuelist);
                        return rq;
                } else {
                        trace_kyber_throttled(kqd->q,
                                              kyber_domain_names[khd->cur_domain]);
                }
        }

        /* There were either no pending requests or no tokens. */
        return NULL;
}
static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
        struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
        struct kyber_hctx_data *khd = hctx->sched_data;
        struct request *rq;
        int i;

        spin_lock(&khd->lock);

        /*
         * First, if we are still entitled to batch, try to dispatch a request
         * from the batch.
         */
        if (khd->batching < kyber_batch_size[khd->cur_domain]) {
                rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
                if (rq)
                        goto out;
        }

        /*
         * Either,
         * 1. We were no longer entitled to a batch.
         * 2. The domain we were batching didn't have any requests.
         * 3. The domain we were batching was out of tokens.
         *
         * Start another batch. Note that this wraps back around to the original
         * domain if no other domains have requests or tokens.
         */
        khd->batching = 0;
        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
                if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
                        khd->cur_domain = 0;
                else
                        khd->cur_domain++;

                rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
                if (rq)
                        goto out;
        }

        rq = NULL;
out:
        spin_unlock(&khd->lock);
        return rq;
}

static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
{
        struct kyber_hctx_data *khd = hctx->sched_data;
        int i;

        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
                if (!list_empty_careful(&khd->rqs[i]) ||
                    sbitmap_any_bit_set(&khd->kcq_map[i]))
                        return true;
        }

        return false;
}
#define KYBER_LAT_SHOW_STORE(domain, name)                              \
static ssize_t kyber_##name##_lat_show(struct elevator_queue *e,       \
                                       char *page)                     \
{                                                                       \
        struct kyber_queue_data *kqd = e->elevator_data;               \
                                                                        \
        return sprintf(page, "%llu\n", kqd->latency_targets[domain]);  \
}                                                                       \
                                                                        \
static ssize_t kyber_##name##_lat_store(struct elevator_queue *e,      \
                                        const char *page, size_t count) \
{                                                                       \
        struct kyber_queue_data *kqd = e->elevator_data;               \
        unsigned long long nsec;                                        \
        int ret;                                                        \
                                                                        \
        ret = kstrtoull(page, 10, &nsec);                               \
        if (ret)                                                        \
                return ret;                                             \
                                                                        \
        kqd->latency_targets[domain] = nsec;                            \
                                                                        \
        return count;                                                   \
}
KYBER_LAT_SHOW_STORE(KYBER_READ, read);
KYBER_LAT_SHOW_STORE(KYBER_WRITE, write);
#undef KYBER_LAT_SHOW_STORE

#define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
static struct elv_fs_entry kyber_sched_attrs[] = {
        KYBER_LAT_ATTR(read),
        KYBER_LAT_ATTR(write),
        __ATTR_NULL
};
#undef KYBER_LAT_ATTR
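
/*
 * When kyber is the active scheduler, these attributes are typically exposed
 * as read_lat_nsec and write_lat_nsec under /sys/block/<dev>/queue/iosched/;
 * e.g. writing 1000000 to read_lat_nsec sets a 1 ms read latency target.
 */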
#ifdef CONFIG_BLK_DEBUG_FS
#define KYBER_DEBUGFS_DOMAIN_ATTRS(domain, name)                        \
static int kyber_##name##_tokens_show(void *data, struct seq_file *m)  \
{                                                                       \
        struct request_queue *q = data;                                 \
        struct kyber_queue_data *kqd = q->elevator->elevator_data;      \
                                                                        \
        sbitmap_queue_show(&kqd->domain_tokens[domain], m);             \
        return 0;                                                       \
}                                                                       \
                                                                        \
static void *kyber_##name##_rqs_start(struct seq_file *m, loff_t *pos) \
        __acquires(&khd->lock)                                          \
{                                                                       \
        struct blk_mq_hw_ctx *hctx = m->private;                        \
        struct kyber_hctx_data *khd = hctx->sched_data;                 \
                                                                        \
        spin_lock(&khd->lock);                                          \
        return seq_list_start(&khd->rqs[domain], *pos);                 \
}                                                                       \
                                                                        \
static void *kyber_##name##_rqs_next(struct seq_file *m, void *v,      \
                                     loff_t *pos)                       \
{                                                                       \
        struct blk_mq_hw_ctx *hctx = m->private;                        \
        struct kyber_hctx_data *khd = hctx->sched_data;                 \
                                                                        \
        return seq_list_next(v, &khd->rqs[domain], pos);                \
}                                                                       \
                                                                        \
static void kyber_##name##_rqs_stop(struct seq_file *m, void *v)       \
        __releases(&khd->lock)                                          \
{                                                                       \
        struct blk_mq_hw_ctx *hctx = m->private;                        \
        struct kyber_hctx_data *khd = hctx->sched_data;                 \
                                                                        \
        spin_unlock(&khd->lock);                                        \
}                                                                       \
                                                                        \
static const struct seq_operations kyber_##name##_rqs_seq_ops = {      \
        .start  = kyber_##name##_rqs_start,                             \
        .next   = kyber_##name##_rqs_next,                              \
        .stop   = kyber_##name##_rqs_stop,                              \
        .show   = blk_mq_debugfs_rq_show,                               \
};                                                                      \
                                                                        \
static int kyber_##name##_waiting_show(void *data, struct seq_file *m) \
{                                                                       \
        struct blk_mq_hw_ctx *hctx = data;                              \
        struct kyber_hctx_data *khd = hctx->sched_data;                 \
        wait_queue_entry_t *wait = &khd->domain_wait[domain];           \
                                                                        \
        seq_printf(m, "%d\n", !list_empty_careful(&wait->entry));       \
        return 0;                                                       \
}
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_WRITE, write)
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_DISCARD, discard)
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other)
#undef KYBER_DEBUGFS_DOMAIN_ATTRS

static int kyber_async_depth_show(void *data, struct seq_file *m)
{
        struct request_queue *q = data;
        struct kyber_queue_data *kqd = q->elevator->elevator_data;

        seq_printf(m, "%u\n", kqd->async_depth);
        return 0;
}

static int kyber_cur_domain_show(void *data, struct seq_file *m)
{
        struct blk_mq_hw_ctx *hctx = data;
        struct kyber_hctx_data *khd = hctx->sched_data;

        seq_printf(m, "%s\n", kyber_domain_names[khd->cur_domain]);
        return 0;
}

static int kyber_batching_show(void *data, struct seq_file *m)
{
        struct blk_mq_hw_ctx *hctx = data;
        struct kyber_hctx_data *khd = hctx->sched_data;

        seq_printf(m, "%u\n", khd->batching);
        return 0;
}

#define KYBER_QUEUE_DOMAIN_ATTRS(name)  \
        {#name "_tokens", 0400, kyber_##name##_tokens_show}
static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
        KYBER_QUEUE_DOMAIN_ATTRS(read),
        KYBER_QUEUE_DOMAIN_ATTRS(write),
        KYBER_QUEUE_DOMAIN_ATTRS(discard),
        KYBER_QUEUE_DOMAIN_ATTRS(other),
        {"async_depth", 0400, kyber_async_depth_show},
        {},
};
#undef KYBER_QUEUE_DOMAIN_ATTRS

#define KYBER_HCTX_DOMAIN_ATTRS(name)                                   \
        {#name "_rqs", 0400, .seq_ops = &kyber_##name##_rqs_seq_ops},   \
        {#name "_waiting", 0400, kyber_##name##_waiting_show}
static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
        KYBER_HCTX_DOMAIN_ATTRS(read),
        KYBER_HCTX_DOMAIN_ATTRS(write),
        KYBER_HCTX_DOMAIN_ATTRS(discard),
        KYBER_HCTX_DOMAIN_ATTRS(other),
        {"cur_domain", 0400, kyber_cur_domain_show},
        {"batching", 0400, kyber_batching_show},
        {},
};
#undef KYBER_HCTX_DOMAIN_ATTRS
#endif
static struct elevator_type kyber_sched = {
        .ops.mq = {
                .init_sched = kyber_init_sched,
                .exit_sched = kyber_exit_sched,
                .init_hctx = kyber_init_hctx,
                .exit_hctx = kyber_exit_hctx,
                .limit_depth = kyber_limit_depth,
                .bio_merge = kyber_bio_merge,
                .prepare_request = kyber_prepare_request,
                .insert_requests = kyber_insert_requests,
                .finish_request = kyber_finish_request,
                .requeue_request = kyber_finish_request,
                .completed_request = kyber_completed_request,
                .dispatch_request = kyber_dispatch_request,
                .has_work = kyber_has_work,
        },
        .uses_mq = true,
#ifdef CONFIG_BLK_DEBUG_FS
        .queue_debugfs_attrs = kyber_queue_debugfs_attrs,
        .hctx_debugfs_attrs = kyber_hctx_debugfs_attrs,
#endif
        .elevator_attrs = kyber_sched_attrs,
        .elevator_name = "kyber",
        .elevator_owner = THIS_MODULE,
};

static int __init kyber_init(void)
{
        return elv_register(&kyber_sched);
}

static void __exit kyber_exit(void)
{
        elv_unregister(&kyber_sched);
}

module_init(kyber_init);
module_exit(kyber_exit);

MODULE_AUTHOR("Omar Sandoval");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Kyber I/O scheduler");