dm-cache-policy-smq.c

/*
 * Copyright (C) 2015 Red Hat. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm-cache-background-tracker.h"
#include "dm-cache-policy-internal.h"
#include "dm-cache-policy.h"
#include "dm.h"

#include <linux/hash.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <linux/math64.h>

#define DM_MSG_PREFIX "cache-policy-smq"

/*----------------------------------------------------------------*/

/*
 * Safe division functions that return zero on divide by zero.
 */
static unsigned safe_div(unsigned n, unsigned d)
{
	return d ? n / d : 0u;
}

static unsigned safe_mod(unsigned n, unsigned d)
{
	return d ? n % d : 0u;
}

/*----------------------------------------------------------------*/

struct entry {
	unsigned hash_next:28;
	unsigned prev:28;
	unsigned next:28;
	unsigned level:6;
	bool dirty:1;
	bool allocated:1;
	bool sentinel:1;
	bool pending_work:1;

	dm_oblock_t oblock;
};
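
/*
 * Entries refer to one another by 28-bit indices into a shared array
 * (see entry_space below) rather than by pointer, which keeps the
 * struct small; INDEXER_NULL, defined below, plays the role of NULL.
 */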
/*----------------------------------------------------------------*/

#define INDEXER_NULL ((1u << 28u) - 1u)

/*
 * An entry_space manages a set of entries that we use for the queues.
 * The clean and dirty queues share entries, so this object is separate
 * from the queue itself.
 */
struct entry_space {
	struct entry *begin;
	struct entry *end;
};

static int space_init(struct entry_space *es, unsigned nr_entries)
{
	if (!nr_entries) {
		es->begin = es->end = NULL;
		return 0;
	}

	es->begin = vzalloc(sizeof(struct entry) * nr_entries);
	if (!es->begin)
		return -ENOMEM;

	es->end = es->begin + nr_entries;
	return 0;
}

static void space_exit(struct entry_space *es)
{
	vfree(es->begin);
}

static struct entry *__get_entry(struct entry_space *es, unsigned block)
{
	struct entry *e;

	e = es->begin + block;
	BUG_ON(e >= es->end);

	return e;
}

static unsigned to_index(struct entry_space *es, struct entry *e)
{
	BUG_ON(e < es->begin || e >= es->end);
	return e - es->begin;
}

static struct entry *to_entry(struct entry_space *es, unsigned block)
{
	if (block == INDEXER_NULL)
		return NULL;

	return __get_entry(es, block);
}

/*----------------------------------------------------------------*/
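
/*
 * An intrusive, doubly linked list built from entry indices rather than
 * pointers.  Sentinel entries may sit on a list but are never counted
 * in nr_elts.
 */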
struct ilist {
	unsigned nr_elts;	/* excluding sentinel entries */
	unsigned head, tail;
};

static void l_init(struct ilist *l)
{
	l->nr_elts = 0;
	l->head = l->tail = INDEXER_NULL;
}

static struct entry *l_head(struct entry_space *es, struct ilist *l)
{
	return to_entry(es, l->head);
}

static struct entry *l_tail(struct entry_space *es, struct ilist *l)
{
	return to_entry(es, l->tail);
}

static struct entry *l_next(struct entry_space *es, struct entry *e)
{
	return to_entry(es, e->next);
}

static struct entry *l_prev(struct entry_space *es, struct entry *e)
{
	return to_entry(es, e->prev);
}

static bool l_empty(struct ilist *l)
{
	return l->head == INDEXER_NULL;
}

static void l_add_head(struct entry_space *es, struct ilist *l, struct entry *e)
{
	struct entry *head = l_head(es, l);

	e->next = l->head;
	e->prev = INDEXER_NULL;

	if (head)
		head->prev = l->head = to_index(es, e);
	else
		l->head = l->tail = to_index(es, e);

	if (!e->sentinel)
		l->nr_elts++;
}

static void l_add_tail(struct entry_space *es, struct ilist *l, struct entry *e)
{
	struct entry *tail = l_tail(es, l);

	e->next = INDEXER_NULL;
	e->prev = l->tail;

	if (tail)
		tail->next = l->tail = to_index(es, e);
	else
		l->head = l->tail = to_index(es, e);

	if (!e->sentinel)
		l->nr_elts++;
}

static void l_add_before(struct entry_space *es, struct ilist *l,
			 struct entry *old, struct entry *e)
{
	struct entry *prev = l_prev(es, old);

	if (!prev)
		l_add_head(es, l, e);

	else {
		e->prev = old->prev;
		e->next = to_index(es, old);
		prev->next = old->prev = to_index(es, e);

		if (!e->sentinel)
			l->nr_elts++;
	}
}

static void l_del(struct entry_space *es, struct ilist *l, struct entry *e)
{
	struct entry *prev = l_prev(es, e);
	struct entry *next = l_next(es, e);

	if (prev)
		prev->next = e->next;
	else
		l->head = e->next;

	if (next)
		next->prev = e->prev;
	else
		l->tail = e->prev;

	if (!e->sentinel)
		l->nr_elts--;
}

static struct entry *l_pop_tail(struct entry_space *es, struct ilist *l)
{
	struct entry *e;

	for (e = l_tail(es, l); e; e = l_prev(es, e))
		if (!e->sentinel) {
			l_del(es, l, e);
			return e;
		}

	return NULL;
}

/*----------------------------------------------------------------*/

/*
 * The stochastic-multi-queue is a set of lru lists stacked into levels.
 * Entries are moved up levels when they are used, which loosely orders the
 * most accessed entries in the top levels and least in the bottom.  This
 * structure is *much* better than a single lru list.
 */
#define MAX_LEVELS 64u

struct queue {
	struct entry_space *es;

	unsigned nr_elts;
	unsigned nr_levels;
	struct ilist qs[MAX_LEVELS];

	/*
	 * We maintain a count of the number of entries we would like in each
	 * level.
	 */
	unsigned last_target_nr_elts;
	unsigned nr_top_levels;
	unsigned nr_in_top_levels;
	unsigned target_count[MAX_LEVELS];
};

static void q_init(struct queue *q, struct entry_space *es, unsigned nr_levels)
{
	unsigned i;

	q->es = es;
	q->nr_elts = 0;
	q->nr_levels = nr_levels;

	for (i = 0; i < q->nr_levels; i++) {
		l_init(q->qs + i);
		q->target_count[i] = 0u;
	}

	q->last_target_nr_elts = 0u;
	q->nr_top_levels = 0u;
	q->nr_in_top_levels = 0u;
}

static unsigned q_size(struct queue *q)
{
	return q->nr_elts;
}

/*
 * Insert an entry to the back of the given level.
 */
static void q_push(struct queue *q, struct entry *e)
{
	BUG_ON(e->pending_work);

	if (!e->sentinel)
		q->nr_elts++;

	l_add_tail(q->es, q->qs + e->level, e);
}

static void q_push_front(struct queue *q, struct entry *e)
{
	BUG_ON(e->pending_work);

	if (!e->sentinel)
		q->nr_elts++;

	l_add_head(q->es, q->qs + e->level, e);
}

static void q_push_before(struct queue *q, struct entry *old, struct entry *e)
{
	BUG_ON(e->pending_work);

	if (!e->sentinel)
		q->nr_elts++;

	l_add_before(q->es, q->qs + e->level, old, e);
}

static void q_del(struct queue *q, struct entry *e)
{
	l_del(q->es, q->qs + e->level, e);
	if (!e->sentinel)
		q->nr_elts--;
}

/*
 * Return the oldest entry of the lowest populated level.
 */
static struct entry *q_peek(struct queue *q, unsigned max_level, bool can_cross_sentinel)
{
	unsigned level;
	struct entry *e;

	max_level = min(max_level, q->nr_levels);

	for (level = 0; level < max_level; level++)
		for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
			if (e->sentinel) {
				if (can_cross_sentinel)
					continue;
				else
					break;
			}

			return e;
		}

	return NULL;
}

static struct entry *q_pop(struct queue *q)
{
	struct entry *e = q_peek(q, q->nr_levels, true);

	if (e)
		q_del(q, e);

	return e;
}

/*
 * This function assumes there is a non-sentinel entry to pop.  It's only
 * used by redistribute, so we know this is true.  It also doesn't adjust
 * the q->nr_elts count.
 */
static struct entry *__redist_pop_from(struct queue *q, unsigned level)
{
	struct entry *e;

	for (; level < q->nr_levels; level++)
		for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e))
			if (!e->sentinel) {
				l_del(q->es, q->qs + e->level, e);
				return e;
			}

	return NULL;
}

static void q_set_targets_subrange_(struct queue *q, unsigned nr_elts, unsigned lbegin, unsigned lend)
{
	unsigned level, nr_levels, entries_per_level, remainder;

	BUG_ON(lbegin > lend);
	BUG_ON(lend > q->nr_levels);
	nr_levels = lend - lbegin;
	entries_per_level = safe_div(nr_elts, nr_levels);
	remainder = safe_mod(nr_elts, nr_levels);

	for (level = lbegin; level < lend; level++)
		q->target_count[level] =
			(level < (lbegin + remainder)) ? entries_per_level + 1u : entries_per_level;
}

/*
 * Typically we have fewer elements in the top few levels which allows us
 * to adjust the promote threshold nicely.
 */
static void q_set_targets(struct queue *q)
{
	if (q->last_target_nr_elts == q->nr_elts)
		return;

	q->last_target_nr_elts = q->nr_elts;

	if (q->nr_top_levels > q->nr_levels)
		q_set_targets_subrange_(q, q->nr_elts, 0, q->nr_levels);

	else {
		q_set_targets_subrange_(q, q->nr_in_top_levels,
					q->nr_levels - q->nr_top_levels, q->nr_levels);

		if (q->nr_in_top_levels < q->nr_elts)
			q_set_targets_subrange_(q, q->nr_elts - q->nr_in_top_levels,
						0, q->nr_levels - q->nr_top_levels);
		else
			q_set_targets_subrange_(q, 0, 0, q->nr_levels - q->nr_top_levels);
	}
}
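
/*
 * Rebalance the levels so each holds roughly its target_count: pull the
 * oldest entries down from the level above while a level is under
 * target, and push its newest entries up while it is over target.
 */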
static void q_redistribute(struct queue *q)
{
	unsigned target, level;
	struct ilist *l, *l_above;
	struct entry *e;

	q_set_targets(q);

	for (level = 0u; level < q->nr_levels - 1u; level++) {
		l = q->qs + level;
		target = q->target_count[level];

		/*
		 * Pull down some entries from the level above.
		 */
		while (l->nr_elts < target) {
			e = __redist_pop_from(q, level + 1u);
			if (!e) {
				/* bug in nr_elts */
				break;
			}

			e->level = level;
			l_add_tail(q->es, l, e);
		}

		/*
		 * Push some entries up.
		 */
		l_above = q->qs + level + 1u;
		while (l->nr_elts > target) {
			e = l_pop_tail(q->es, l);

			if (!e)
				/* bug in nr_elts */
				break;

			e->level = level + 1u;
			l_add_tail(q->es, l_above, e);
		}
	}
}
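
/*
 * Promote an entry by extra_levels, swapping it with a victim from the
 * destination level so the level populations stay roughly balanced.
 * The s1/s2 sentinels let the caller place the demoted victim relative
 * to the sentinels that were passed while searching for it.
 */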
static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels,
		      struct entry *s1, struct entry *s2)
{
	struct entry *de;
	unsigned sentinels_passed = 0;
	unsigned new_level = min(q->nr_levels - 1u, e->level + extra_levels);

	/* try and find an entry to swap with */
	if (extra_levels && (e->level < q->nr_levels - 1u)) {
		for (de = l_head(q->es, q->qs + new_level); de && de->sentinel; de = l_next(q->es, de))
			sentinels_passed++;

		if (de) {
			q_del(q, de);
			de->level = e->level;
			if (s1) {
				switch (sentinels_passed) {
				case 0:
					q_push_before(q, s1, de);
					break;

				case 1:
					q_push_before(q, s2, de);
					break;

				default:
					q_push(q, de);
				}
			} else
				q_push(q, de);
		}
	}

	q_del(q, e);
	e->level = new_level;
	q_push(q, e);
}

/*----------------------------------------------------------------*/

#define FP_SHIFT 8
#define SIXTEENTH (1u << (FP_SHIFT - 4u))
#define EIGHTH (1u << (FP_SHIFT - 3u))
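
/*
 * 8-bit fixed point: a confidence of 256 means every hit landed in the
 * top levels.  SIXTEENTH (16) and EIGHTH (32) are the 1/16 and 1/8
 * thresholds used by stats_assess() below.
 */
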
struct stats {
	unsigned hit_threshold;
	unsigned hits;
	unsigned misses;
};

enum performance {
	Q_POOR,
	Q_FAIR,
	Q_WELL
};

static void stats_init(struct stats *s, unsigned nr_levels)
{
	s->hit_threshold = (nr_levels * 3u) / 4u;
	s->hits = 0u;
	s->misses = 0u;
}

static void stats_reset(struct stats *s)
{
	s->hits = s->misses = 0u;
}

static void stats_level_accessed(struct stats *s, unsigned level)
{
	if (level >= s->hit_threshold)
		s->hits++;
	else
		s->misses++;
}

static void stats_miss(struct stats *s)
{
	s->misses++;
}

/*
 * There are times when we don't have any confidence in the hotspot
 * queue, such as when a fresh cache is created and the blocks have been
 * spread out across the levels, or when the io load changes.  We detect
 * this by seeing how often a lookup is in the top levels of the hotspot
 * queue.
 */
static enum performance stats_assess(struct stats *s)
{
	unsigned confidence = safe_div(s->hits << FP_SHIFT, s->hits + s->misses);

	if (confidence < SIXTEENTH)
		return Q_POOR;

	else if (confidence < EIGHTH)
		return Q_FAIR;

	else
		return Q_WELL;
}

/*----------------------------------------------------------------*/

struct smq_hash_table {
	struct entry_space *es;
	unsigned long long hash_bits;
	unsigned *buckets;
};

/*
 * All cache entries are stored in a chained hash table.  To save space we
 * use indexing again, and only store indexes to the next entry.
 */
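
/*
 * Roughly one bucket for every four entries, rounded up to a power of
 * two with a floor of 16; hash_bits is log2 of the bucket count.
 */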
static int h_init(struct smq_hash_table *ht, struct entry_space *es, unsigned nr_entries)
{
	unsigned i, nr_buckets;

	ht->es = es;
	nr_buckets = roundup_pow_of_two(max(nr_entries / 4u, 16u));
	ht->hash_bits = __ffs(nr_buckets);

	ht->buckets = vmalloc(sizeof(*ht->buckets) * nr_buckets);
	if (!ht->buckets)
		return -ENOMEM;

	for (i = 0; i < nr_buckets; i++)
		ht->buckets[i] = INDEXER_NULL;

	return 0;
}

static void h_exit(struct smq_hash_table *ht)
{
	vfree(ht->buckets);
}

static struct entry *h_head(struct smq_hash_table *ht, unsigned bucket)
{
	return to_entry(ht->es, ht->buckets[bucket]);
}

static struct entry *h_next(struct smq_hash_table *ht, struct entry *e)
{
	return to_entry(ht->es, e->hash_next);
}

static void __h_insert(struct smq_hash_table *ht, unsigned bucket, struct entry *e)
{
	e->hash_next = ht->buckets[bucket];
	ht->buckets[bucket] = to_index(ht->es, e);
}

static void h_insert(struct smq_hash_table *ht, struct entry *e)
{
	unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);

	__h_insert(ht, h, e);
}

static struct entry *__h_lookup(struct smq_hash_table *ht, unsigned h, dm_oblock_t oblock,
				struct entry **prev)
{
	struct entry *e;

	*prev = NULL;
	for (e = h_head(ht, h); e; e = h_next(ht, e)) {
		if (e->oblock == oblock)
			return e;

		*prev = e;
	}

	return NULL;
}

static void __h_unlink(struct smq_hash_table *ht, unsigned h,
		       struct entry *e, struct entry *prev)
{
	if (prev)
		prev->hash_next = e->hash_next;
	else
		ht->buckets[h] = e->hash_next;
}

/*
 * Also moves each entry to the front of the bucket.
 */
static struct entry *h_lookup(struct smq_hash_table *ht, dm_oblock_t oblock)
{
	struct entry *e, *prev;
	unsigned h = hash_64(from_oblock(oblock), ht->hash_bits);

	e = __h_lookup(ht, h, oblock, &prev);
	if (e && prev) {
		/*
		 * Move to the front because this entry is likely
		 * to be hit again.
		 */
		__h_unlink(ht, h, e, prev);
		__h_insert(ht, h, e);
	}

	return e;
}

static void h_remove(struct smq_hash_table *ht, struct entry *e)
{
	unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
	struct entry *prev;

	/*
	 * The down side of using a singly linked list is we have to
	 * iterate the bucket to remove an item.
	 */
	e = __h_lookup(ht, h, e->oblock, &prev);
	if (e)
		__h_unlink(ht, h, e, prev);
}

/*----------------------------------------------------------------*/

struct entry_alloc {
	struct entry_space *es;
	unsigned begin;

	unsigned nr_allocated;
	struct ilist free;
};
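
/*
 * Hands out entries from a fixed [begin, end) slice of the shared
 * entry_space, keeping unused entries on a free list.
 */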
static void init_allocator(struct entry_alloc *ea, struct entry_space *es,
			   unsigned begin, unsigned end)
{
	unsigned i;

	ea->es = es;
	ea->nr_allocated = 0u;
	ea->begin = begin;

	l_init(&ea->free);
	for (i = begin; i != end; i++)
		l_add_tail(ea->es, &ea->free, __get_entry(ea->es, i));
}

static void init_entry(struct entry *e)
{
	/*
	 * We can't memset because that would clear the hotspot and
	 * sentinel bits which remain constant.
	 */
	e->hash_next = INDEXER_NULL;
	e->next = INDEXER_NULL;
	e->prev = INDEXER_NULL;
	e->level = 0u;
	e->dirty = true;	/* FIXME: audit */
	e->allocated = true;
	e->sentinel = false;
	e->pending_work = false;
}

static struct entry *alloc_entry(struct entry_alloc *ea)
{
	struct entry *e;

	if (l_empty(&ea->free))
		return NULL;

	e = l_pop_tail(ea->es, &ea->free);
	init_entry(e);
	ea->nr_allocated++;

	return e;
}

/*
 * This assumes the cblock hasn't already been allocated.
 */
static struct entry *alloc_particular_entry(struct entry_alloc *ea, unsigned i)
{
	struct entry *e = __get_entry(ea->es, ea->begin + i);

	BUG_ON(e->allocated);

	l_del(ea->es, &ea->free, e);
	init_entry(e);
	ea->nr_allocated++;

	return e;
}

static void free_entry(struct entry_alloc *ea, struct entry *e)
{
	BUG_ON(!ea->nr_allocated);
	BUG_ON(!e->allocated);

	ea->nr_allocated--;
	e->allocated = false;
	l_add_tail(ea->es, &ea->free, e);
}

static bool allocator_empty(struct entry_alloc *ea)
{
	return l_empty(&ea->free);
}

static unsigned get_index(struct entry_alloc *ea, struct entry *e)
{
	return to_index(ea->es, e) - ea->begin;
}

static struct entry *get_entry(struct entry_alloc *ea, unsigned index)
{
	return __get_entry(ea->es, ea->begin + index);
}

/*----------------------------------------------------------------*/

#define NR_HOTSPOT_LEVELS 64u
#define NR_CACHE_LEVELS 64u

#define WRITEBACK_PERIOD (10ul * HZ)
#define DEMOTE_PERIOD (60ul * HZ)

#define HOTSPOT_UPDATE_PERIOD (HZ)
#define CACHE_UPDATE_PERIOD (60ul * HZ)

struct smq_policy {
	struct dm_cache_policy policy;

	/* protects everything */
	spinlock_t lock;
	dm_cblock_t cache_size;
	sector_t cache_block_size;

	sector_t hotspot_block_size;
	unsigned nr_hotspot_blocks;
	unsigned cache_blocks_per_hotspot_block;
	unsigned hotspot_level_jump;

	struct entry_space es;
	struct entry_alloc writeback_sentinel_alloc;
	struct entry_alloc demote_sentinel_alloc;
	struct entry_alloc hotspot_alloc;
	struct entry_alloc cache_alloc;

	unsigned long *hotspot_hit_bits;
	unsigned long *cache_hit_bits;

	/*
	 * We maintain three queues of entries.  The cache proper,
	 * consisting of a clean and a dirty queue, contains the currently
	 * active mappings.  The hotspot queue uses a larger block size to
	 * track blocks that are being hit frequently and are potential
	 * candidates for promotion to the cache.
	 */
	struct queue hotspot;
	struct queue clean;
	struct queue dirty;

	struct stats hotspot_stats;
	struct stats cache_stats;

	/*
	 * Keeps track of time, incremented by the core.  We use this to
	 * avoid attributing multiple hits within the same tick.
	 */
	unsigned tick;

	/*
	 * The hash tables allow us to quickly find an entry by origin
	 * block.
	 */
	struct smq_hash_table table;
	struct smq_hash_table hotspot_table;

	bool current_writeback_sentinels;
	unsigned long next_writeback_period;

	bool current_demote_sentinels;
	unsigned long next_demote_period;

	unsigned write_promote_level;
	unsigned read_promote_level;

	unsigned long next_hotspot_period;
	unsigned long next_cache_period;

	struct background_tracker *bg_work;

	bool migrations_allowed;
};

/*----------------------------------------------------------------*/
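
/*
 * Each queue keeps two alternating sets of sentinels (one sentinel per
 * level in each set), selected by 'which'.  update_sentinels() below
 * flips the current set once per writeback/demote period and requeues
 * its sentinels at the back of their levels, so, roughly, an entry
 * still ahead of a sentinel has not been requeued for at least one full
 * period.
 */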
static struct entry *get_sentinel(struct entry_alloc *ea, unsigned level, bool which)
{
	return get_entry(ea, which ? level : NR_CACHE_LEVELS + level);
}

static struct entry *writeback_sentinel(struct smq_policy *mq, unsigned level)
{
	return get_sentinel(&mq->writeback_sentinel_alloc, level, mq->current_writeback_sentinels);
}

static struct entry *demote_sentinel(struct smq_policy *mq, unsigned level)
{
	return get_sentinel(&mq->demote_sentinel_alloc, level, mq->current_demote_sentinels);
}

static void __update_writeback_sentinels(struct smq_policy *mq)
{
	unsigned level;
	struct queue *q = &mq->dirty;
	struct entry *sentinel;

	for (level = 0; level < q->nr_levels; level++) {
		sentinel = writeback_sentinel(mq, level);
		q_del(q, sentinel);
		q_push(q, sentinel);
	}
}

static void __update_demote_sentinels(struct smq_policy *mq)
{
	unsigned level;
	struct queue *q = &mq->clean;
	struct entry *sentinel;

	for (level = 0; level < q->nr_levels; level++) {
		sentinel = demote_sentinel(mq, level);
		q_del(q, sentinel);
		q_push(q, sentinel);
	}
}

static void update_sentinels(struct smq_policy *mq)
{
	if (time_after(jiffies, mq->next_writeback_period)) {
		mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
		mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
		__update_writeback_sentinels(mq);
	}

	if (time_after(jiffies, mq->next_demote_period)) {
		mq->next_demote_period = jiffies + DEMOTE_PERIOD;
		mq->current_demote_sentinels = !mq->current_demote_sentinels;
		__update_demote_sentinels(mq);
	}
}

static void __sentinels_init(struct smq_policy *mq)
{
	unsigned level;
	struct entry *sentinel;

	for (level = 0; level < NR_CACHE_LEVELS; level++) {
		sentinel = writeback_sentinel(mq, level);
		sentinel->level = level;
		q_push(&mq->dirty, sentinel);

		sentinel = demote_sentinel(mq, level);
		sentinel->level = level;
		q_push(&mq->clean, sentinel);
	}
}

static void sentinels_init(struct smq_policy *mq)
{
	mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
	mq->next_demote_period = jiffies + DEMOTE_PERIOD;

	mq->current_writeback_sentinels = false;
	mq->current_demote_sentinels = false;
	__sentinels_init(mq);

	mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
	mq->current_demote_sentinels = !mq->current_demote_sentinels;
	__sentinels_init(mq);
}

/*----------------------------------------------------------------*/

static void del_queue(struct smq_policy *mq, struct entry *e)
{
	q_del(e->dirty ? &mq->dirty : &mq->clean, e);
}

static void push_queue(struct smq_policy *mq, struct entry *e)
{
	if (e->dirty)
		q_push(&mq->dirty, e);
	else
		q_push(&mq->clean, e);
}
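
/*
 * The single letter annotations used on these functions describe entry
 * state: h = present in the hash table, q = on the clean/dirty queue,
 * a = allocated; a leading '!' negates.  So "!h, !q, a -> h, q, a"
 * means the function takes an allocated entry that is neither hashed
 * nor queued, and leaves it hashed and queued.
 */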
// !h, !q, a -> h, q, a
static void push(struct smq_policy *mq, struct entry *e)
{
	h_insert(&mq->table, e);
	if (!e->pending_work)
		push_queue(mq, e);
}

static void push_queue_front(struct smq_policy *mq, struct entry *e)
{
	if (e->dirty)
		q_push_front(&mq->dirty, e);
	else
		q_push_front(&mq->clean, e);
}

static void push_front(struct smq_policy *mq, struct entry *e)
{
	h_insert(&mq->table, e);
	if (!e->pending_work)
		push_queue_front(mq, e);
}

static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
{
	return to_cblock(get_index(&mq->cache_alloc, e));
}

static void requeue(struct smq_policy *mq, struct entry *e)
{
	/*
	 * Pending work has temporarily been taken out of the queues.
	 */
	if (e->pending_work)
		return;

	if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) {
		if (!e->dirty) {
			q_requeue(&mq->clean, e, 1u, NULL, NULL);
			return;
		}

		q_requeue(&mq->dirty, e, 1u,
			  get_sentinel(&mq->writeback_sentinel_alloc, e->level, !mq->current_writeback_sentinels),
			  get_sentinel(&mq->writeback_sentinel_alloc, e->level, mq->current_writeback_sentinels));
	}
}

static unsigned default_promote_level(struct smq_policy *mq)
{
	/*
	 * The promote level depends on the current performance of the
	 * cache.
	 *
	 * If the cache is performing badly, then we can't afford
	 * to promote much without causing performance to drop below that
	 * of the origin device.
	 *
	 * If the cache is performing well, then we don't need to promote
	 * much.  If it isn't broken, don't fix it.
	 *
	 * If the cache is middling then we promote more.
	 *
	 * This scheme reminds me of a graph of entropy vs probability of a
	 * binary variable.
	 */
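	/* The index below maps hits / (hits + misses) onto the table's 17 slots (0..16). */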
	static unsigned table[] = {1, 1, 1, 2, 4, 6, 7, 8, 7, 6, 4, 4, 3, 3, 2, 2, 1};

	unsigned hits = mq->cache_stats.hits;
	unsigned misses = mq->cache_stats.misses;
	unsigned index = safe_div(hits << 4u, hits + misses);
	return table[index];
}

static void update_promote_levels(struct smq_policy *mq)
{
	/*
	 * If there are unused cache entries then we want to be really
	 * eager to promote.
	 */
	unsigned threshold_level = allocator_empty(&mq->cache_alloc) ?
		default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u);

	threshold_level = max(threshold_level, NR_HOTSPOT_LEVELS);

	/*
	 * If the hotspot queue is performing badly then we have little
	 * confidence that we know which blocks to promote.  So we cut down
	 * the amount of promotions.
	 */
	switch (stats_assess(&mq->hotspot_stats)) {
	case Q_POOR:
		threshold_level /= 4u;
		break;

	case Q_FAIR:
		threshold_level /= 2u;
		break;

	case Q_WELL:
		break;
	}

	mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level;
	mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level);
}

/*
 * If the hotspot queue is performing badly, then we try and move entries
 * around more quickly.
 */
static void update_level_jump(struct smq_policy *mq)
{
	switch (stats_assess(&mq->hotspot_stats)) {
	case Q_POOR:
		mq->hotspot_level_jump = 4u;
		break;

	case Q_FAIR:
		mq->hotspot_level_jump = 2u;
		break;

	case Q_WELL:
		mq->hotspot_level_jump = 1u;
		break;
	}
}

static void end_hotspot_period(struct smq_policy *mq)
{
	clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
	update_promote_levels(mq);

	if (time_after(jiffies, mq->next_hotspot_period)) {
		update_level_jump(mq);
		q_redistribute(&mq->hotspot);
		stats_reset(&mq->hotspot_stats);
		mq->next_hotspot_period = jiffies + HOTSPOT_UPDATE_PERIOD;
	}
}

static void end_cache_period(struct smq_policy *mq)
{
	if (time_after(jiffies, mq->next_cache_period)) {
		clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));

		q_redistribute(&mq->dirty);
		q_redistribute(&mq->clean);
		stats_reset(&mq->cache_stats);

		mq->next_cache_period = jiffies + CACHE_UPDATE_PERIOD;
	}
}

/*----------------------------------------------------------------*/

/*
 * Targets are given as a percentage.
 */
#define CLEAN_TARGET 25u
#define FREE_TARGET 25u

static unsigned percent_to_target(struct smq_policy *mq, unsigned p)
{
	return from_cblock(mq->cache_size) * p / 100u;
}

static bool clean_target_met(struct smq_policy *mq, bool idle)
{
	/*
	 * Cache entries may not be populated.  So we cannot rely on the
	 * size of the clean queue.
	 */
	if (idle) {
		/*
		 * We'd like to clean everything.
		 */
		return q_size(&mq->dirty) == 0u;
	}

	/*
	 * If we're busy we don't worry about cleaning at all.
	 */
	return true;
}
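
/*
 * The free target is met when free cache blocks, plus blocks already
 * queued for demotion, make up at least FREE_TARGET percent of the
 * cache.
 */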
static bool free_target_met(struct smq_policy *mq)
{
	unsigned nr_free;

	nr_free = from_cblock(mq->cache_size) - mq->cache_alloc.nr_allocated;
	return (nr_free + btracker_nr_demotions_queued(mq->bg_work)) >=
		percent_to_target(mq, FREE_TARGET);
}

/*----------------------------------------------------------------*/

static void mark_pending(struct smq_policy *mq, struct entry *e)
{
	BUG_ON(e->sentinel);
	BUG_ON(!e->allocated);
	BUG_ON(e->pending_work);
	e->pending_work = true;
}

static void clear_pending(struct smq_policy *mq, struct entry *e)
{
	BUG_ON(!e->pending_work);
	e->pending_work = false;
}

static void queue_writeback(struct smq_policy *mq)
{
	int r;
	struct policy_work work;
	struct entry *e;

	e = q_peek(&mq->dirty, mq->dirty.nr_levels, !mq->migrations_allowed);
	if (e) {
		mark_pending(mq, e);
		q_del(&mq->dirty, e);

		work.op = POLICY_WRITEBACK;
		work.oblock = e->oblock;
		work.cblock = infer_cblock(mq, e);

		r = btracker_queue(mq->bg_work, &work, NULL);
		WARN_ON_ONCE(r); // FIXME: finish, I think we have to get rid of this race.
	}
}

static void queue_demotion(struct smq_policy *mq)
{
	struct policy_work work;
	struct entry *e;

	if (unlikely(WARN_ON_ONCE(!mq->migrations_allowed)))
		return;

	e = q_peek(&mq->clean, mq->clean.nr_levels / 2, true);
	if (!e) {
		if (!clean_target_met(mq, true))
			queue_writeback(mq);
		return;
	}

	mark_pending(mq, e);
	q_del(&mq->clean, e);

	work.op = POLICY_DEMOTE;
	work.oblock = e->oblock;
	work.cblock = infer_cblock(mq, e);
	btracker_queue(mq->bg_work, &work, NULL);
}

static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock,
			    struct policy_work **workp)
{
	struct entry *e;
	struct policy_work work;

	if (!mq->migrations_allowed)
		return;

	if (allocator_empty(&mq->cache_alloc)) {
		/*
		 * We always claim to be 'idle' to ensure some demotions happen
		 * with continuous loads.
		 */
		if (!free_target_met(mq))
			queue_demotion(mq);
		return;
	}

	if (btracker_promotion_already_present(mq->bg_work, oblock))
		return;

	/*
	 * We allocate the entry now to reserve the cblock.  If the
	 * background work is aborted we must remember to free it.
	 */
	e = alloc_entry(&mq->cache_alloc);
	BUG_ON(!e);
	e->pending_work = true;
	work.op = POLICY_PROMOTE;
	work.oblock = oblock;
	work.cblock = infer_cblock(mq, e);
	btracker_queue(mq->bg_work, &work, workp);
}

/*----------------------------------------------------------------*/

enum promote_result {
	PROMOTE_NOT,
	PROMOTE_TEMPORARY,
	PROMOTE_PERMANENT
};

/*
 * Converts a boolean into a promote result.
 */
static enum promote_result maybe_promote(bool promote)
{
	return promote ? PROMOTE_PERMANENT : PROMOTE_NOT;
}
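
/*
 * A write with the fast_promote hint gets a temporary promotion
 * whenever spare cache entries exist; otherwise the decision depends on
 * whether the hotspot entry's level has reached the read or write
 * promote threshold.
 */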
static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e,
					  int data_dir, bool fast_promote)
{
	if (data_dir == WRITE) {
		if (!allocator_empty(&mq->cache_alloc) && fast_promote)
			return PROMOTE_TEMPORARY;

		return maybe_promote(hs_e->level >= mq->write_promote_level);
	} else
		return maybe_promote(hs_e->level >= mq->read_promote_level);
}

static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
{
	sector_t r = from_oblock(b);
	(void) sector_div(r, mq->cache_blocks_per_hotspot_block);
	return to_oblock(r);
}
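
/*
 * Hotspot blocks typically span several cache blocks (to_hblock() above
 * does the mapping).  A hit bumps the hotspot entry by
 * hotspot_level_jump levels, but only the first hit in each period
 * counts (tracked via hotspot_hit_bits); on a miss we may recycle the
 * coldest hotspot entry to track the new block.
 */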
static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b)
{
	unsigned hi;
	dm_oblock_t hb = to_hblock(mq, b);
	struct entry *e = h_lookup(&mq->hotspot_table, hb);

	if (e) {
		stats_level_accessed(&mq->hotspot_stats, e->level);

		hi = get_index(&mq->hotspot_alloc, e);
		q_requeue(&mq->hotspot, e,
			  test_and_set_bit(hi, mq->hotspot_hit_bits) ?
			  0u : mq->hotspot_level_jump,
			  NULL, NULL);

	} else {
		stats_miss(&mq->hotspot_stats);

		e = alloc_entry(&mq->hotspot_alloc);
		if (!e) {
			e = q_pop(&mq->hotspot);
			if (e) {
				h_remove(&mq->hotspot_table, e);
				hi = get_index(&mq->hotspot_alloc, e);
				clear_bit(hi, mq->hotspot_hit_bits);
			}
		}

		if (e) {
			e->oblock = hb;
			q_push(&mq->hotspot, e);
			h_insert(&mq->hotspot_table, e);
		}
	}

	return e;
}

/*----------------------------------------------------------------*/

/*
 * Public interface, via the policy struct.  See dm-cache-policy.h for a
 * description of these.
 */

static struct smq_policy *to_smq_policy(struct dm_cache_policy *p)
{
	return container_of(p, struct smq_policy, policy);
}

static void smq_destroy(struct dm_cache_policy *p)
{
	struct smq_policy *mq = to_smq_policy(p);

	btracker_destroy(mq->bg_work);
	h_exit(&mq->hotspot_table);
	h_exit(&mq->table);
	free_bitset(mq->hotspot_hit_bits);
	free_bitset(mq->cache_hit_bits);
	space_exit(&mq->es);
	kfree(mq);
}

/*----------------------------------------------------------------*/

static int __lookup(struct smq_policy *mq, dm_oblock_t oblock, dm_cblock_t *cblock,
		    int data_dir, bool fast_copy,
		    struct policy_work **work, bool *background_work)
{
	struct entry *e, *hs_e;
	enum promote_result pr;

	*background_work = false;

	e = h_lookup(&mq->table, oblock);
	if (e) {
		stats_level_accessed(&mq->cache_stats, e->level);

		requeue(mq, e);
		*cblock = infer_cblock(mq, e);
		return 0;

	} else {
		stats_miss(&mq->cache_stats);

		/*
		 * The hotspot queue only gets updated with misses.
		 */
		hs_e = update_hotspot_queue(mq, oblock);

		pr = should_promote(mq, hs_e, data_dir, fast_copy);
		if (pr != PROMOTE_NOT) {
			queue_promotion(mq, oblock, work);
			*background_work = true;
		}

		return -ENOENT;
	}
}

static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock,
		      int data_dir, bool fast_copy,
		      bool *background_work)
{
	int r;
	unsigned long flags;
	struct smq_policy *mq = to_smq_policy(p);

	spin_lock_irqsave(&mq->lock, flags);
	r = __lookup(mq, oblock, cblock,
		     data_dir, fast_copy,
		     NULL, background_work);
	spin_unlock_irqrestore(&mq->lock, flags);

	return r;
}

static int smq_lookup_with_work(struct dm_cache_policy *p,
				dm_oblock_t oblock, dm_cblock_t *cblock,
				int data_dir, bool fast_copy,
				struct policy_work **work)
{
	int r;
	bool background_queued;
	unsigned long flags;
	struct smq_policy *mq = to_smq_policy(p);

	spin_lock_irqsave(&mq->lock, flags);
	r = __lookup(mq, oblock, cblock, data_dir, fast_copy, work, &background_queued);
	spin_unlock_irqrestore(&mq->lock, flags);

	return r;
}

static int smq_get_background_work(struct dm_cache_policy *p, bool idle,
				   struct policy_work **result)
{
	int r;
	unsigned long flags;
	struct smq_policy *mq = to_smq_policy(p);

	spin_lock_irqsave(&mq->lock, flags);
	r = btracker_issue(mq->bg_work, result);
	if (r == -ENODATA) {
		if (!clean_target_met(mq, idle)) {
			queue_writeback(mq);
			r = btracker_issue(mq->bg_work, result);
		}
	}
	spin_unlock_irqrestore(&mq->lock, flags);

	return r;
}

/*
 * We need to clear any pending work flags that have been set, and in the
 * case of promotion free the entry for the destination cblock.
 */
static void __complete_background_work(struct smq_policy *mq,
				       struct policy_work *work,
				       bool success)
{
	struct entry *e = get_entry(&mq->cache_alloc,
				    from_cblock(work->cblock));

	switch (work->op) {
	case POLICY_PROMOTE:
		// !h, !q, a
		clear_pending(mq, e);
		if (success) {
			e->oblock = work->oblock;
			e->level = NR_CACHE_LEVELS - 1;
			push(mq, e);
			// h, q, a
		} else {
			free_entry(&mq->cache_alloc, e);
			// !h, !q, !a
		}
		break;

	case POLICY_DEMOTE:
		// h, !q, a
		if (success) {
			h_remove(&mq->table, e);
			free_entry(&mq->cache_alloc, e);
			// !h, !q, !a
		} else {
			clear_pending(mq, e);
			push_queue(mq, e);
			// h, q, a
		}
		break;

	case POLICY_WRITEBACK:
		// h, !q, a
		clear_pending(mq, e);
		push_queue(mq, e);
		// h, q, a
		break;
	}

	btracker_complete(mq->bg_work, work);
}

static void smq_complete_background_work(struct dm_cache_policy *p,
					 struct policy_work *work,
					 bool success)
{
	unsigned long flags;
	struct smq_policy *mq = to_smq_policy(p);

	spin_lock_irqsave(&mq->lock, flags);
	__complete_background_work(mq, work, success);
	spin_unlock_irqrestore(&mq->lock, flags);
}

// in_hash(oblock) -> in_hash(oblock)
static void __smq_set_clear_dirty(struct smq_policy *mq, dm_cblock_t cblock, bool set)
{
	struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));

	if (e->pending_work)
		e->dirty = set;
	else {
		del_queue(mq, e);
		e->dirty = set;
		push_queue(mq, e);
	}
}

static void smq_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
{
	unsigned long flags;
	struct smq_policy *mq = to_smq_policy(p);

	spin_lock_irqsave(&mq->lock, flags);
	__smq_set_clear_dirty(mq, cblock, true);
	spin_unlock_irqrestore(&mq->lock, flags);
}

static void smq_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
{
	struct smq_policy *mq = to_smq_policy(p);
	unsigned long flags;

	spin_lock_irqsave(&mq->lock, flags);
	__smq_set_clear_dirty(mq, cblock, false);
	spin_unlock_irqrestore(&mq->lock, flags);
}

static unsigned random_level(dm_cblock_t cblock)
{
	return hash_32(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1);
}

static int smq_load_mapping(struct dm_cache_policy *p,
			    dm_oblock_t oblock, dm_cblock_t cblock,
			    bool dirty, uint32_t hint, bool hint_valid)
{
	struct smq_policy *mq = to_smq_policy(p);
	struct entry *e;

	e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock));
	e->oblock = oblock;
	e->dirty = dirty;
	e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock);
	e->pending_work = false;

	/*
	 * When we load mappings we push ahead of both sentinels in order to
	 * allow demotions and cleaning to occur immediately.
	 */
	push_front(mq, e);

	return 0;
}

static int smq_invalidate_mapping(struct dm_cache_policy *p, dm_cblock_t cblock)
{
	struct smq_policy *mq = to_smq_policy(p);
	struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));

	if (!e->allocated)
		return -ENODATA;

	// FIXME: what if this block has pending background work?
	del_queue(mq, e);
	h_remove(&mq->table, e);
	free_entry(&mq->cache_alloc, e);
	return 0;
}

static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock)
{
	struct smq_policy *mq = to_smq_policy(p);
	struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));

	if (!e->allocated)
		return 0;

	return e->level;
}

static dm_cblock_t smq_residency(struct dm_cache_policy *p)
{
	dm_cblock_t r;
	unsigned long flags;
	struct smq_policy *mq = to_smq_policy(p);

	spin_lock_irqsave(&mq->lock, flags);
	r = to_cblock(mq->cache_alloc.nr_allocated);
	spin_unlock_irqrestore(&mq->lock, flags);

	return r;
}

static void smq_tick(struct dm_cache_policy *p, bool can_block)
{
	struct smq_policy *mq = to_smq_policy(p);
	unsigned long flags;

	spin_lock_irqsave(&mq->lock, flags);
	mq->tick++;
	update_sentinels(mq);
	end_hotspot_period(mq);
	end_cache_period(mq);
	spin_unlock_irqrestore(&mq->lock, flags);
}

static void smq_allow_migrations(struct dm_cache_policy *p, bool allow)
{
	struct smq_policy *mq = to_smq_policy(p);

	mq->migrations_allowed = allow;
}

/*
 * smq has no config values, but the old mq policy did.  To avoid breaking
 * software we continue to accept these configurables for the mq policy,
 * but they have no effect.
 */
static int mq_set_config_value(struct dm_cache_policy *p,
			       const char *key, const char *value)
{
	unsigned long tmp;

	if (kstrtoul(value, 10, &tmp))
		return -EINVAL;

	if (!strcasecmp(key, "random_threshold") ||
	    !strcasecmp(key, "sequential_threshold") ||
	    !strcasecmp(key, "discard_promote_adjustment") ||
	    !strcasecmp(key, "read_promote_adjustment") ||
	    !strcasecmp(key, "write_promote_adjustment")) {
		DMWARN("tunable '%s' no longer has any effect, mq policy is now an alias for smq", key);
		return 0;
	}

	return -EINVAL;
}

static int mq_emit_config_values(struct dm_cache_policy *p, char *result,
				 unsigned maxlen, ssize_t *sz_ptr)
{
	ssize_t sz = *sz_ptr;

	DMEMIT("10 random_threshold 0 "
	       "sequential_threshold 0 "
	       "discard_promote_adjustment 0 "
	       "read_promote_adjustment 0 "
	       "write_promote_adjustment 0 ");

	*sz_ptr = sz;
	return 0;
}

/* Init the policy plugin interface function pointers. */
static void init_policy_functions(struct smq_policy *mq, bool mimic_mq)
{
	mq->policy.destroy = smq_destroy;
	mq->policy.lookup = smq_lookup;
	mq->policy.lookup_with_work = smq_lookup_with_work;
	mq->policy.get_background_work = smq_get_background_work;
	mq->policy.complete_background_work = smq_complete_background_work;
	mq->policy.set_dirty = smq_set_dirty;
	mq->policy.clear_dirty = smq_clear_dirty;
	mq->policy.load_mapping = smq_load_mapping;
	mq->policy.invalidate_mapping = smq_invalidate_mapping;
	mq->policy.get_hint = smq_get_hint;
	mq->policy.residency = smq_residency;
	mq->policy.tick = smq_tick;
	mq->policy.allow_migrations = smq_allow_migrations;

	if (mimic_mq) {
		mq->policy.set_config_value = mq_set_config_value;
		mq->policy.emit_config_values = mq_emit_config_values;
	}
}

static bool too_many_hotspot_blocks(sector_t origin_size,
				    sector_t hotspot_block_size,
				    unsigned nr_hotspot_blocks)
{
	return (hotspot_block_size * nr_hotspot_blocks) > origin_size;
}

static void calc_hotspot_params(sector_t origin_size,
				sector_t cache_block_size,
				unsigned nr_cache_blocks,
				sector_t *hotspot_block_size,
				unsigned *nr_hotspot_blocks)
{
	*hotspot_block_size = cache_block_size * 16u;
	*nr_hotspot_blocks = max(nr_cache_blocks / 4u, 1024u);

	while ((*hotspot_block_size > cache_block_size) &&
	       too_many_hotspot_blocks(origin_size, *hotspot_block_size, *nr_hotspot_blocks))
		*hotspot_block_size /= 2u;
}
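
/*
 * The single entry_space allocated below is carved up as:
 *   [writeback sentinels][demote sentinels][hotspot entries][cache entries]
 * with one entry_alloc per region.
 */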
static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size,
					    sector_t origin_size,
					    sector_t cache_block_size,
					    bool mimic_mq,
					    bool migrations_allowed)
{
	unsigned i;
	unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS;
	unsigned total_sentinels = 2u * nr_sentinels_per_queue;
	struct smq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);

	if (!mq)
		return NULL;

	init_policy_functions(mq, mimic_mq);
	mq->cache_size = cache_size;
	mq->cache_block_size = cache_block_size;

	calc_hotspot_params(origin_size, cache_block_size, from_cblock(cache_size),
			    &mq->hotspot_block_size, &mq->nr_hotspot_blocks);

	mq->cache_blocks_per_hotspot_block = div64_u64(mq->hotspot_block_size, mq->cache_block_size);
	mq->hotspot_level_jump = 1u;
	if (space_init(&mq->es, total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size))) {
		DMERR("couldn't initialize entry space");
		goto bad_pool_init;
	}

	init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue);
	for (i = 0; i < nr_sentinels_per_queue; i++)
		get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true;

	init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels);
	for (i = 0; i < nr_sentinels_per_queue; i++)
		get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true;

	init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels,
		       total_sentinels + mq->nr_hotspot_blocks);

	init_allocator(&mq->cache_alloc, &mq->es,
		       total_sentinels + mq->nr_hotspot_blocks,
		       total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size));

	mq->hotspot_hit_bits = alloc_bitset(mq->nr_hotspot_blocks);
	if (!mq->hotspot_hit_bits) {
		DMERR("couldn't allocate hotspot hit bitset");
		goto bad_hotspot_hit_bits;
	}
	clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);

	if (from_cblock(cache_size)) {
		mq->cache_hit_bits = alloc_bitset(from_cblock(cache_size));
		if (!mq->cache_hit_bits) {
			DMERR("couldn't allocate cache hit bitset");
			goto bad_cache_hit_bits;
		}
		clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
	} else
		mq->cache_hit_bits = NULL;

	mq->tick = 0;
	spin_lock_init(&mq->lock);

	q_init(&mq->hotspot, &mq->es, NR_HOTSPOT_LEVELS);
	mq->hotspot.nr_top_levels = 8;
	mq->hotspot.nr_in_top_levels = min(mq->nr_hotspot_blocks / NR_HOTSPOT_LEVELS,
					   from_cblock(mq->cache_size) / mq->cache_blocks_per_hotspot_block);

	q_init(&mq->clean, &mq->es, NR_CACHE_LEVELS);
	q_init(&mq->dirty, &mq->es, NR_CACHE_LEVELS);

	stats_init(&mq->hotspot_stats, NR_HOTSPOT_LEVELS);
	stats_init(&mq->cache_stats, NR_CACHE_LEVELS);

	if (h_init(&mq->table, &mq->es, from_cblock(cache_size)))
		goto bad_alloc_table;

	if (h_init(&mq->hotspot_table, &mq->es, mq->nr_hotspot_blocks))
		goto bad_alloc_hotspot_table;

	sentinels_init(mq);
	mq->write_promote_level = mq->read_promote_level = NR_HOTSPOT_LEVELS;

	mq->next_hotspot_period = jiffies;
	mq->next_cache_period = jiffies;

	mq->bg_work = btracker_create(10240); /* FIXME: hard coded value */
	if (!mq->bg_work)
		goto bad_btracker;

	mq->migrations_allowed = migrations_allowed;

	return &mq->policy;

bad_btracker:
	h_exit(&mq->hotspot_table);
bad_alloc_hotspot_table:
	h_exit(&mq->table);
bad_alloc_table:
	free_bitset(mq->cache_hit_bits);
bad_cache_hit_bits:
	free_bitset(mq->hotspot_hit_bits);
bad_hotspot_hit_bits:
	space_exit(&mq->es);
bad_pool_init:
	kfree(mq);
	return NULL;
}

static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
					  sector_t origin_size,
					  sector_t cache_block_size)
{
	return __smq_create(cache_size, origin_size, cache_block_size, false, true);
}

static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
					 sector_t origin_size,
					 sector_t cache_block_size)
{
	return __smq_create(cache_size, origin_size, cache_block_size, true, true);
}

static struct dm_cache_policy *cleaner_create(dm_cblock_t cache_size,
					      sector_t origin_size,
					      sector_t cache_block_size)
{
	return __smq_create(cache_size, origin_size, cache_block_size, false, false);
}

/*----------------------------------------------------------------*/

static struct dm_cache_policy_type smq_policy_type = {
	.name = "smq",
	.version = {2, 0, 0},
	.hint_size = 4,
	.owner = THIS_MODULE,
	.create = smq_create
};

static struct dm_cache_policy_type mq_policy_type = {
	.name = "mq",
	.version = {2, 0, 0},
	.hint_size = 4,
	.owner = THIS_MODULE,
	.create = mq_create,
};

static struct dm_cache_policy_type cleaner_policy_type = {
	.name = "cleaner",
	.version = {2, 0, 0},
	.hint_size = 4,
	.owner = THIS_MODULE,
	.create = cleaner_create,
};

static struct dm_cache_policy_type default_policy_type = {
	.name = "default",
	.version = {2, 0, 0},
	.hint_size = 4,
	.owner = THIS_MODULE,
	.create = smq_create,
	.real = &smq_policy_type
};

static int __init smq_init(void)
{
	int r;

	r = dm_cache_policy_register(&smq_policy_type);
	if (r) {
		DMERR("register failed %d", r);
		return -ENOMEM;
	}

	r = dm_cache_policy_register(&mq_policy_type);
	if (r) {
		DMERR("register failed (as mq) %d", r);
		goto out_mq;
	}

	r = dm_cache_policy_register(&cleaner_policy_type);
	if (r) {
		DMERR("register failed (as cleaner) %d", r);
		goto out_cleaner;
	}

	r = dm_cache_policy_register(&default_policy_type);
	if (r) {
		DMERR("register failed (as default) %d", r);
		goto out_default;
	}

	return 0;

out_default:
	dm_cache_policy_unregister(&cleaner_policy_type);
out_cleaner:
	dm_cache_policy_unregister(&mq_policy_type);
out_mq:
	dm_cache_policy_unregister(&smq_policy_type);
	return -ENOMEM;
}

static void __exit smq_exit(void)
{
	dm_cache_policy_unregister(&cleaner_policy_type);
	dm_cache_policy_unregister(&smq_policy_type);
	dm_cache_policy_unregister(&mq_policy_type);
	dm_cache_policy_unregister(&default_policy_type);
}

module_init(smq_init);
module_exit(smq_exit);

MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("smq cache policy");

MODULE_ALIAS("dm-cache-default");
MODULE_ALIAS("dm-cache-mq");
MODULE_ALIAS("dm-cache-cleaner");