dm-cache-target.c

  1. /*
  2. * Copyright (C) 2012 Red Hat. All rights reserved.
  3. *
  4. * This file is released under the GPL.
  5. */
  6. #include "dm.h"
  7. #include "dm-bio-prison.h"
  8. #include "dm-bio-record.h"
  9. #include "dm-cache-metadata.h"
  10. #include <linux/dm-io.h>
  11. #include <linux/dm-kcopyd.h>
  12. #include <linux/jiffies.h>
  13. #include <linux/init.h>
  14. #include <linux/mempool.h>
  15. #include <linux/module.h>
  16. #include <linux/slab.h>
  17. #include <linux/vmalloc.h>
  18. #define DM_MSG_PREFIX "cache"
  19. DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
  20. "A percentage of time allocated for copying to and/or from cache");
  21. /*----------------------------------------------------------------*/
  22. #define IOT_RESOLUTION 4
  23. struct io_tracker {
  24. spinlock_t lock;
  25. /*
  26. * Sectors of in-flight IO.
  27. */
  28. sector_t in_flight;
  29. /*
  30. * The time, in jiffies, when this device became idle (if it is
  31. * indeed idle).
  32. */
  33. unsigned long idle_time;
  34. unsigned long last_update_time;
  35. };
  36. static void iot_init(struct io_tracker *iot)
  37. {
  38. spin_lock_init(&iot->lock);
  39. iot->in_flight = 0ul;
  40. iot->idle_time = 0ul;
  41. iot->last_update_time = jiffies;
  42. }
  43. static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs)
  44. {
  45. if (iot->in_flight)
  46. return false;
  47. return time_after(jiffies, iot->idle_time + jifs);
  48. }
  49. static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs)
  50. {
  51. bool r;
  52. unsigned long flags;
  53. spin_lock_irqsave(&iot->lock, flags);
  54. r = __iot_idle_for(iot, jifs);
  55. spin_unlock_irqrestore(&iot->lock, flags);
  56. return r;
  57. }
  58. static void iot_io_begin(struct io_tracker *iot, sector_t len)
  59. {
  60. unsigned long flags;
  61. spin_lock_irqsave(&iot->lock, flags);
  62. iot->in_flight += len;
  63. spin_unlock_irqrestore(&iot->lock, flags);
  64. }
  65. static void __iot_io_end(struct io_tracker *iot, sector_t len)
  66. {
  67. iot->in_flight -= len;
  68. if (!iot->in_flight)
  69. iot->idle_time = jiffies;
  70. }
  71. static void iot_io_end(struct io_tracker *iot, sector_t len)
  72. {
  73. unsigned long flags;
  74. spin_lock_irqsave(&iot->lock, flags);
  75. __iot_io_end(iot, len);
  76. spin_unlock_irqrestore(&iot->lock, flags);
  77. }
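/*
 * Illustrative sketch, not part of the upstream file: further down,
 * accounted_begin() and accounted_complete() use this tracker to
 * account sectors of in-flight origin IO, roughly:
 *
 *	iot_io_begin(&cache->origin_tracker, bio_sectors(bio));
 *	generic_make_request(bio);
 *	...
 *	iot_io_end(&cache->origin_tracker, bio_sectors(bio));
 *
 * and iot_idle_for(&cache->origin_tracker, HZ) can then report whether
 * the origin has seen no such IO for at least a second.
 */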
  78. /*----------------------------------------------------------------*/
  79. /*
  80. * Glossary:
  81. *
  82. * oblock: index of an origin block
  83. * cblock: index of a cache block
  84. * promotion: movement of a block from origin to cache
  85. * demotion: movement of a block from cache to origin
  86. * migration: movement of a block between the origin and cache device,
  87. * either direction
  88. */
  89. /*----------------------------------------------------------------*/
  90. /*
  91. * There are a couple of places where we let a bio run, but want to do some
  92. * work before calling its endio function. We do this by temporarily
  93. * changing the endio fn.
  94. */
  95. struct dm_hook_info {
  96. bio_end_io_t *bi_end_io;
  97. void *bi_private;
  98. };
  99. static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
  100. bio_end_io_t *bi_end_io, void *bi_private)
  101. {
  102. h->bi_end_io = bio->bi_end_io;
  103. h->bi_private = bio->bi_private;
  104. bio->bi_end_io = bi_end_io;
  105. bio->bi_private = bi_private;
  106. }
  107. static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
  108. {
  109. bio->bi_end_io = h->bi_end_io;
  110. bio->bi_private = h->bi_private;
  111. }
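/*
 * Illustrative sketch, not part of the upstream file: the pattern, as
 * used by remap_to_origin_then_cache() and writethrough_endio() below,
 * is (my_endio and my_context are placeholder names):
 *
 *	dm_hook_bio(&pb->hook_info, bio, my_endio, my_context);
 *	... submit the bio ...
 *
 * then, in my_endio():
 *
 *	dm_unhook_bio(&pb->hook_info, bio);
 *	... do the extra work ...
 *
 * after which the original bi_end_io/bi_private are back in place and
 * the bio can be completed or re-issued as normal.
 */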
  112. /*----------------------------------------------------------------*/
  113. #define MIGRATION_POOL_SIZE 128
  114. #define COMMIT_PERIOD HZ
  115. #define MIGRATION_COUNT_WINDOW 10
  116. /*
  117. * The block size of the device holding cache data must be
  118. * between 32KB and 1GB.
  119. */
  120. #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
  121. #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
  122. enum cache_metadata_mode {
  123. CM_WRITE, /* metadata may be changed */
  124. CM_READ_ONLY, /* metadata may not be changed */
  125. CM_FAIL
  126. };
  127. enum cache_io_mode {
  128. /*
  129. * Data is written to cached blocks only. These blocks are marked
  130. * dirty. If you lose the cache device you will lose data.
  131. * Potential performance increase for both reads and writes.
  132. */
  133. CM_IO_WRITEBACK,
  134. /*
  135. * Data is written to both cache and origin. Blocks are never
  136. * dirty. Potential performance benefit for reads only.
  137. */
  138. CM_IO_WRITETHROUGH,
  139. /*
  140. * A degraded mode useful for various cache coherency situations
  141. * (eg, rolling back snapshots). Reads and writes always go to the
  142. * origin. If a write goes to a cached oblock, then the cache
  143. * block is invalidated.
  144. */
  145. CM_IO_PASSTHROUGH
  146. };
  147. struct cache_features {
  148. enum cache_metadata_mode mode;
  149. enum cache_io_mode io_mode;
  150. };
  151. struct cache_stats {
  152. atomic_t read_hit;
  153. atomic_t read_miss;
  154. atomic_t write_hit;
  155. atomic_t write_miss;
  156. atomic_t demotion;
  157. atomic_t promotion;
  158. atomic_t copies_avoided;
  159. atomic_t cache_cell_clash;
  160. atomic_t commit_count;
  161. atomic_t discard_count;
  162. };
  163. /*
  164. * Defines a range of cblocks, begin to (end - 1) are in the range. end is
  165. * the one-past-the-end value.
  166. */
  167. struct cblock_range {
  168. dm_cblock_t begin;
  169. dm_cblock_t end;
  170. };
  171. struct invalidation_request {
  172. struct list_head list;
  173. struct cblock_range *cblocks;
  174. atomic_t complete;
  175. int err;
  176. wait_queue_head_t result_wait;
  177. };
  178. struct cache {
  179. struct dm_target *ti;
  180. struct dm_target_callbacks callbacks;
  181. struct dm_cache_metadata *cmd;
  182. /*
  183. * Metadata is written to this device.
  184. */
  185. struct dm_dev *metadata_dev;
  186. /*
  187. * The slower of the two data devices. Typically a spindle.
  188. */
  189. struct dm_dev *origin_dev;
  190. /*
  191. * The faster of the two data devices. Typically an SSD.
  192. */
  193. struct dm_dev *cache_dev;
  194. /*
  195. * Size of the origin device in _complete_ blocks and native sectors.
  196. */
  197. dm_oblock_t origin_blocks;
  198. sector_t origin_sectors;
  199. /*
  200. * Size of the cache device in blocks.
  201. */
  202. dm_cblock_t cache_size;
  203. /*
  204. * Fields for converting from sectors to blocks.
  205. */
  206. uint32_t sectors_per_block;
  207. int sectors_per_block_shift;
  208. spinlock_t lock;
  209. struct list_head deferred_cells;
  210. struct bio_list deferred_bios;
  211. struct bio_list deferred_flush_bios;
  212. struct bio_list deferred_writethrough_bios;
  213. struct list_head quiesced_migrations;
  214. struct list_head completed_migrations;
  215. struct list_head need_commit_migrations;
  216. sector_t migration_threshold;
  217. wait_queue_head_t migration_wait;
  218. atomic_t nr_allocated_migrations;
  219. /*
  220. * The number of in flight migrations that are performing
  221. * background io. eg, promotion, writeback.
  222. */
  223. atomic_t nr_io_migrations;
  224. wait_queue_head_t quiescing_wait;
  225. atomic_t quiescing;
  226. atomic_t quiescing_ack;
  227. /*
  228. * cache_size entries, dirty if set
  229. */
  230. atomic_t nr_dirty;
  231. unsigned long *dirty_bitset;
  232. /*
  233. * origin_blocks entries, discarded if set.
  234. */
  235. dm_dblock_t discard_nr_blocks;
  236. unsigned long *discard_bitset;
  237. uint32_t discard_block_size; /* a power of 2 times sectors per block */
  238. /*
  239. * Rather than reconstructing the table line for the status we just
  240. * save it and regurgitate.
  241. */
  242. unsigned nr_ctr_args;
  243. const char **ctr_args;
  244. struct dm_kcopyd_client *copier;
  245. struct workqueue_struct *wq;
  246. struct work_struct worker;
  247. struct delayed_work waker;
  248. unsigned long last_commit_jiffies;
  249. struct dm_bio_prison *prison;
  250. struct dm_deferred_set *all_io_ds;
  251. mempool_t *migration_pool;
  252. struct dm_cache_policy *policy;
  253. unsigned policy_nr_args;
  254. bool need_tick_bio:1;
  255. bool sized:1;
  256. bool invalidate:1;
  257. bool commit_requested:1;
  258. bool loaded_mappings:1;
  259. bool loaded_discards:1;
  260. /*
  261. * Cache features such as write-through.
  262. */
  263. struct cache_features features;
  264. struct cache_stats stats;
  265. /*
  266. * Invalidation fields.
  267. */
  268. spinlock_t invalidation_lock;
  269. struct list_head invalidation_requests;
  270. struct io_tracker origin_tracker;
  271. };
  272. struct per_bio_data {
  273. bool tick:1;
  274. unsigned req_nr:2;
  275. struct dm_deferred_entry *all_io_entry;
  276. struct dm_hook_info hook_info;
  277. sector_t len;
  278. /*
  279. * writethrough fields. These MUST remain at the end of this
  280. * structure and the 'cache' member must be the first as it
  281. * is used to determine the offset of the writethrough fields.
  282. */
  283. struct cache *cache;
  284. dm_cblock_t cblock;
  285. struct dm_bio_details bio_details;
  286. };
  287. struct dm_cache_migration {
  288. struct list_head list;
  289. struct cache *cache;
  290. unsigned long start_jiffies;
  291. dm_oblock_t old_oblock;
  292. dm_oblock_t new_oblock;
  293. dm_cblock_t cblock;
  294. bool err:1;
  295. bool discard:1;
  296. bool writeback:1;
  297. bool demote:1;
  298. bool promote:1;
  299. bool requeue_holder:1;
  300. bool invalidate:1;
  301. struct dm_bio_prison_cell *old_ocell;
  302. struct dm_bio_prison_cell *new_ocell;
  303. };
  304. /*
  305. * Processing a bio in the worker thread may require these memory
  306. * allocations. We prealloc to avoid deadlocks (the same worker thread
  307. * frees them back to the mempool).
  308. */
  309. struct prealloc {
  310. struct dm_cache_migration *mg;
  311. struct dm_bio_prison_cell *cell1;
  312. struct dm_bio_prison_cell *cell2;
  313. };
  314. static enum cache_metadata_mode get_cache_mode(struct cache *cache);
  315. static void wake_worker(struct cache *cache)
  316. {
  317. queue_work(cache->wq, &cache->worker);
  318. }
  319. /*----------------------------------------------------------------*/
  320. static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
  321. {
  322. /* FIXME: change to use a local slab. */
  323. return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
  324. }
  325. static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
  326. {
  327. dm_bio_prison_free_cell(cache->prison, cell);
  328. }
  329. static struct dm_cache_migration *alloc_migration(struct cache *cache)
  330. {
  331. struct dm_cache_migration *mg;
  332. mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
  333. if (mg) {
  334. mg->cache = cache;
  335. atomic_inc(&mg->cache->nr_allocated_migrations);
  336. }
  337. return mg;
  338. }
  339. static void free_migration(struct dm_cache_migration *mg)
  340. {
  341. struct cache *cache = mg->cache;
  342. if (atomic_dec_and_test(&cache->nr_allocated_migrations))
  343. wake_up(&cache->migration_wait);
  344. mempool_free(mg, cache->migration_pool);
  345. wake_worker(cache);
  346. }
  347. static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
  348. {
  349. if (!p->mg) {
  350. p->mg = alloc_migration(cache);
  351. if (!p->mg)
  352. return -ENOMEM;
  353. }
  354. if (!p->cell1) {
  355. p->cell1 = alloc_prison_cell(cache);
  356. if (!p->cell1)
  357. return -ENOMEM;
  358. }
  359. if (!p->cell2) {
  360. p->cell2 = alloc_prison_cell(cache);
  361. if (!p->cell2)
  362. return -ENOMEM;
  363. }
  364. return 0;
  365. }
  366. static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
  367. {
  368. if (p->cell2)
  369. free_prison_cell(cache, p->cell2);
  370. if (p->cell1)
  371. free_prison_cell(cache, p->cell1);
  372. if (p->mg)
  373. free_migration(p->mg);
  374. }
  375. static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
  376. {
  377. struct dm_cache_migration *mg = p->mg;
  378. BUG_ON(!mg);
  379. p->mg = NULL;
  380. return mg;
  381. }
  382. /*
  383. * You must have a cell within the prealloc struct to return. If not this
  384. * function will BUG() rather than returning NULL.
  385. */
  386. static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
  387. {
  388. struct dm_bio_prison_cell *r = NULL;
  389. if (p->cell1) {
  390. r = p->cell1;
  391. p->cell1 = NULL;
  392. } else if (p->cell2) {
  393. r = p->cell2;
  394. p->cell2 = NULL;
  395. } else
  396. BUG();
  397. return r;
  398. }
  399. /*
  400. * You can't have more than two cells in a prealloc struct. BUG() will be
  401. * called if you try and overfill.
  402. */
  403. static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
  404. {
  405. if (!p->cell2)
  406. p->cell2 = cell;
  407. else if (!p->cell1)
  408. p->cell1 = cell;
  409. else
  410. BUG();
  411. }
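/*
 * Illustrative sketch, not part of the upstream file: the worker
 * thread drives these helpers roughly like this (see the bio
 * processing functions later in the file):
 *
 *	struct prealloc structs;
 *
 *	memset(&structs, 0, sizeof(structs));
 *	while (there is work) {
 *		if (prealloc_data_structs(cache, &structs))
 *			break;		(out of memory - retry later)
 *		... consume with prealloc_get_cell() and
 *		prealloc_get_migration(), returning unused cells with
 *		prealloc_put_cell() ...
 *	}
 *	prealloc_free_structs(cache, &structs);
 */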
  412. /*----------------------------------------------------------------*/
  413. static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key)
  414. {
  415. key->virtual = 0;
  416. key->dev = 0;
  417. key->block_begin = from_oblock(begin);
  418. key->block_end = from_oblock(end);
  419. }
  420. /*
  421. * The caller hands in a preallocated cell, and a free function for it.
  422. * The cell will be freed if there's an error, or if it wasn't used because
  423. * a cell with that key already exists.
  424. */
  425. typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
  426. static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end,
  427. struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
  428. cell_free_fn free_fn, void *free_context,
  429. struct dm_bio_prison_cell **cell_result)
  430. {
  431. int r;
  432. struct dm_cell_key key;
  433. build_key(oblock_begin, oblock_end, &key);
  434. r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
  435. if (r)
  436. free_fn(free_context, cell_prealloc);
  437. return r;
  438. }
  439. static int bio_detain(struct cache *cache, dm_oblock_t oblock,
  440. struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
  441. cell_free_fn free_fn, void *free_context,
  442. struct dm_bio_prison_cell **cell_result)
  443. {
  444. dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
  445. return bio_detain_range(cache, oblock, end, bio,
  446. cell_prealloc, free_fn, free_context, cell_result);
  447. }
  448. static int get_cell(struct cache *cache,
  449. dm_oblock_t oblock,
  450. struct prealloc *structs,
  451. struct dm_bio_prison_cell **cell_result)
  452. {
  453. int r;
  454. struct dm_cell_key key;
  455. struct dm_bio_prison_cell *cell_prealloc;
  456. cell_prealloc = prealloc_get_cell(structs);
  457. build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key);
  458. r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
  459. if (r)
  460. prealloc_put_cell(structs, cell_prealloc);
  461. return r;
  462. }
  463. /*----------------------------------------------------------------*/
  464. static bool is_dirty(struct cache *cache, dm_cblock_t b)
  465. {
  466. return test_bit(from_cblock(b), cache->dirty_bitset);
  467. }
  468. static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
  469. {
  470. if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
  471. atomic_inc(&cache->nr_dirty);
  472. policy_set_dirty(cache->policy, oblock);
  473. }
  474. }
  475. static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
  476. {
  477. if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
  478. policy_clear_dirty(cache->policy, oblock);
  479. if (atomic_dec_return(&cache->nr_dirty) == 0)
  480. dm_table_event(cache->ti->table);
  481. }
  482. }
  483. /*----------------------------------------------------------------*/
  484. static bool block_size_is_power_of_two(struct cache *cache)
  485. {
  486. return cache->sectors_per_block_shift >= 0;
  487. }
  488. /* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */
  489. #if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6
  490. __always_inline
  491. #endif
  492. static dm_block_t block_div(dm_block_t b, uint32_t n)
  493. {
  494. do_div(b, n);
  495. return b;
  496. }
  497. static dm_block_t oblocks_per_dblock(struct cache *cache)
  498. {
  499. dm_block_t oblocks = cache->discard_block_size;
  500. if (block_size_is_power_of_two(cache))
  501. oblocks >>= cache->sectors_per_block_shift;
  502. else
  503. oblocks = block_div(oblocks, cache->sectors_per_block);
  504. return oblocks;
  505. }
  506. static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
  507. {
  508. return to_dblock(block_div(from_oblock(oblock),
  509. oblocks_per_dblock(cache)));
  510. }
  511. static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock)
  512. {
  513. return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache));
  514. }
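/*
 * Worked example, not part of the upstream file (the sizes are
 * assumptions for illustration): with 128 sectors per cache block and
 * discard_block_size = 1024 sectors, oblocks_per_dblock() is
 * 1024 / 128 = 8.  Origin block 20 then lands in discard block
 * 20 / 8 = 2, and dblock_to_oblock(2) returns 2 * 8 = 16, the first
 * origin block that discard block covers.
 */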
  515. static void set_discard(struct cache *cache, dm_dblock_t b)
  516. {
  517. unsigned long flags;
  518. BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks));
  519. atomic_inc(&cache->stats.discard_count);
  520. spin_lock_irqsave(&cache->lock, flags);
  521. set_bit(from_dblock(b), cache->discard_bitset);
  522. spin_unlock_irqrestore(&cache->lock, flags);
  523. }
  524. static void clear_discard(struct cache *cache, dm_dblock_t b)
  525. {
  526. unsigned long flags;
  527. spin_lock_irqsave(&cache->lock, flags);
  528. clear_bit(from_dblock(b), cache->discard_bitset);
  529. spin_unlock_irqrestore(&cache->lock, flags);
  530. }
  531. static bool is_discarded(struct cache *cache, dm_dblock_t b)
  532. {
  533. int r;
  534. unsigned long flags;
  535. spin_lock_irqsave(&cache->lock, flags);
  536. r = test_bit(from_dblock(b), cache->discard_bitset);
  537. spin_unlock_irqrestore(&cache->lock, flags);
  538. return r;
  539. }
  540. static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
  541. {
  542. int r;
  543. unsigned long flags;
  544. spin_lock_irqsave(&cache->lock, flags);
  545. r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
  546. cache->discard_bitset);
  547. spin_unlock_irqrestore(&cache->lock, flags);
  548. return r;
  549. }
  550. /*----------------------------------------------------------------*/
  551. static void load_stats(struct cache *cache)
  552. {
  553. struct dm_cache_statistics stats;
  554. dm_cache_metadata_get_stats(cache->cmd, &stats);
  555. atomic_set(&cache->stats.read_hit, stats.read_hits);
  556. atomic_set(&cache->stats.read_miss, stats.read_misses);
  557. atomic_set(&cache->stats.write_hit, stats.write_hits);
  558. atomic_set(&cache->stats.write_miss, stats.write_misses);
  559. }
  560. static void save_stats(struct cache *cache)
  561. {
  562. struct dm_cache_statistics stats;
  563. if (get_cache_mode(cache) >= CM_READ_ONLY)
  564. return;
  565. stats.read_hits = atomic_read(&cache->stats.read_hit);
  566. stats.read_misses = atomic_read(&cache->stats.read_miss);
  567. stats.write_hits = atomic_read(&cache->stats.write_hit);
  568. stats.write_misses = atomic_read(&cache->stats.write_miss);
  569. dm_cache_metadata_set_stats(cache->cmd, &stats);
  570. }
  571. /*----------------------------------------------------------------
  572. * Per bio data
  573. *--------------------------------------------------------------*/
  574. /*
  575. * If using writeback, leave out struct per_bio_data's writethrough fields.
  576. */
  577. #define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
  578. #define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
  579. static bool writethrough_mode(struct cache_features *f)
  580. {
  581. return f->io_mode == CM_IO_WRITETHROUGH;
  582. }
  583. static bool writeback_mode(struct cache_features *f)
  584. {
  585. return f->io_mode == CM_IO_WRITEBACK;
  586. }
  587. static bool passthrough_mode(struct cache_features *f)
  588. {
  589. return f->io_mode == CM_IO_PASSTHROUGH;
  590. }
  591. static size_t get_per_bio_data_size(struct cache *cache)
  592. {
  593. return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
  594. }
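/*
 * Note, not part of the upstream file: 'cache' is the first of the
 * writethrough-only members of struct per_bio_data, so
 * offsetof(struct per_bio_data, cache) measures just the common prefix
 * (tick, req_nr, all_io_entry, hook_info, len).  Writeback and
 * passthrough bios therefore reserve only PB_DATA_SIZE_WB bytes of
 * per-bio data, while writethrough bios get the full structure,
 * including cblock and bio_details, as PB_DATA_SIZE_WT.
 */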
  595. static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
  596. {
  597. struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
  598. BUG_ON(!pb);
  599. return pb;
  600. }
  601. static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
  602. {
  603. struct per_bio_data *pb = get_per_bio_data(bio, data_size);
  604. pb->tick = false;
  605. pb->req_nr = dm_bio_get_target_bio_nr(bio);
  606. pb->all_io_entry = NULL;
  607. pb->len = 0;
  608. return pb;
  609. }
  610. /*----------------------------------------------------------------
  611. * Remapping
  612. *--------------------------------------------------------------*/
  613. static void remap_to_origin(struct cache *cache, struct bio *bio)
  614. {
  615. bio->bi_bdev = cache->origin_dev->bdev;
  616. }
  617. static void remap_to_cache(struct cache *cache, struct bio *bio,
  618. dm_cblock_t cblock)
  619. {
  620. sector_t bi_sector = bio->bi_iter.bi_sector;
  621. sector_t block = from_cblock(cblock);
  622. bio->bi_bdev = cache->cache_dev->bdev;
  623. if (!block_size_is_power_of_two(cache))
  624. bio->bi_iter.bi_sector =
  625. (block * cache->sectors_per_block) +
  626. sector_div(bi_sector, cache->sectors_per_block);
  627. else
  628. bio->bi_iter.bi_sector =
  629. (block << cache->sectors_per_block_shift) |
  630. (bi_sector & (cache->sectors_per_block - 1));
  631. }
  632. static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
  633. {
  634. unsigned long flags;
  635. size_t pb_data_size = get_per_bio_data_size(cache);
  636. struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
  637. spin_lock_irqsave(&cache->lock, flags);
  638. if (cache->need_tick_bio &&
  639. !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) {
  640. pb->tick = true;
  641. cache->need_tick_bio = false;
  642. }
  643. spin_unlock_irqrestore(&cache->lock, flags);
  644. }
  645. static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
  646. dm_oblock_t oblock)
  647. {
  648. check_if_tick_bio_needed(cache, bio);
  649. remap_to_origin(cache, bio);
  650. if (bio_data_dir(bio) == WRITE)
  651. clear_discard(cache, oblock_to_dblock(cache, oblock));
  652. }
  653. static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
  654. dm_oblock_t oblock, dm_cblock_t cblock)
  655. {
  656. check_if_tick_bio_needed(cache, bio);
  657. remap_to_cache(cache, bio, cblock);
  658. if (bio_data_dir(bio) == WRITE) {
  659. set_dirty(cache, oblock, cblock);
  660. clear_discard(cache, oblock_to_dblock(cache, oblock));
  661. }
  662. }
  663. static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
  664. {
  665. sector_t block_nr = bio->bi_iter.bi_sector;
  666. if (!block_size_is_power_of_two(cache))
  667. (void) sector_div(block_nr, cache->sectors_per_block);
  668. else
  669. block_nr >>= cache->sectors_per_block_shift;
  670. return to_oblock(block_nr);
  671. }
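/*
 * Worked example, not part of the upstream file (the sizes are
 * assumptions for illustration): with sectors_per_block = 128
 * (sectors_per_block_shift = 7), a bio starting at sector 1000 maps to
 * origin block 1000 >> 7 = 7.  remap_to_cache() keeps the in-block
 * offset 1000 & 127 = 104, so a hit on cblock 3 is sent to cache
 * sector (3 << 7) | 104 = 488.  With a non-power-of-two block size the
 * same values come from sector_div() instead of the shift and mask.
 */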
  672. static int bio_triggers_commit(struct cache *cache, struct bio *bio)
  673. {
  674. return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
  675. }
  676. /*
  677. * You must increment the deferred set whilst the prison cell is held. To
  678. * encourage this, we ask for 'cell' to be passed in.
  679. */
  680. static void inc_ds(struct cache *cache, struct bio *bio,
  681. struct dm_bio_prison_cell *cell)
  682. {
  683. size_t pb_data_size = get_per_bio_data_size(cache);
  684. struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
  685. BUG_ON(!cell);
  686. BUG_ON(pb->all_io_entry);
  687. pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
  688. }
  689. static bool accountable_bio(struct cache *cache, struct bio *bio)
  690. {
  691. return ((bio->bi_bdev == cache->origin_dev->bdev) &&
  692. !(bio->bi_rw & REQ_DISCARD));
  693. }
  694. static void accounted_begin(struct cache *cache, struct bio *bio)
  695. {
  696. size_t pb_data_size = get_per_bio_data_size(cache);
  697. struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
  698. if (accountable_bio(cache, bio)) {
  699. pb->len = bio_sectors(bio);
  700. iot_io_begin(&cache->origin_tracker, pb->len);
  701. }
  702. }
  703. static void accounted_complete(struct cache *cache, struct bio *bio)
  704. {
  705. size_t pb_data_size = get_per_bio_data_size(cache);
  706. struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
  707. iot_io_end(&cache->origin_tracker, pb->len);
  708. }
  709. static void accounted_request(struct cache *cache, struct bio *bio)
  710. {
  711. accounted_begin(cache, bio);
  712. generic_make_request(bio);
  713. }
  714. static void issue(struct cache *cache, struct bio *bio)
  715. {
  716. unsigned long flags;
  717. if (!bio_triggers_commit(cache, bio)) {
  718. accounted_request(cache, bio);
  719. return;
  720. }
  721. /*
  722. * Batch together any bios that trigger commits and then issue a
  723. * single commit for them in do_worker().
  724. */
  725. spin_lock_irqsave(&cache->lock, flags);
  726. cache->commit_requested = true;
  727. bio_list_add(&cache->deferred_flush_bios, bio);
  728. spin_unlock_irqrestore(&cache->lock, flags);
  729. }
  730. static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell)
  731. {
  732. inc_ds(cache, bio, cell);
  733. issue(cache, bio);
  734. }
  735. static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
  736. {
  737. unsigned long flags;
  738. spin_lock_irqsave(&cache->lock, flags);
  739. bio_list_add(&cache->deferred_writethrough_bios, bio);
  740. spin_unlock_irqrestore(&cache->lock, flags);
  741. wake_worker(cache);
  742. }
  743. static void writethrough_endio(struct bio *bio, int err)
  744. {
  745. struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
  746. dm_unhook_bio(&pb->hook_info, bio);
  747. if (err) {
  748. bio_endio(bio, err);
  749. return;
  750. }
  751. dm_bio_restore(&pb->bio_details, bio);
  752. remap_to_cache(pb->cache, bio, pb->cblock);
  753. /*
  754. * We can't issue this bio directly, since we're in interrupt
  755. * context. So it gets put on a bio list for processing by the
  756. * worker thread.
  757. */
  758. defer_writethrough_bio(pb->cache, bio);
  759. }
  760. /*
  761. * When running in writethrough mode we need to send writes to clean blocks
  762. * to both the cache and origin devices. In future we'd like to clone the
  763. * bio and send them in parallel, but for now we're doing them in
  764. * series as this is easier.
  765. */
  766. static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
  767. dm_oblock_t oblock, dm_cblock_t cblock)
  768. {
  769. struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
  770. pb->cache = cache;
  771. pb->cblock = cblock;
  772. dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL);
  773. dm_bio_record(&pb->bio_details, bio);
  774. remap_to_origin_clear_discard(pb->cache, bio, oblock);
  775. }
  776. /*----------------------------------------------------------------
  777. * Failure modes
  778. *--------------------------------------------------------------*/
  779. static enum cache_metadata_mode get_cache_mode(struct cache *cache)
  780. {
  781. return cache->features.mode;
  782. }
  783. static const char *cache_device_name(struct cache *cache)
  784. {
  785. return dm_device_name(dm_table_get_md(cache->ti->table));
  786. }
  787. static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode)
  788. {
  789. const char *descs[] = {
  790. "write",
  791. "read-only",
  792. "fail"
  793. };
  794. dm_table_event(cache->ti->table);
  795. DMINFO("%s: switching cache to %s mode",
  796. cache_device_name(cache), descs[(int)mode]);
  797. }
  798. static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode)
  799. {
  800. bool needs_check = dm_cache_metadata_needs_check(cache->cmd);
  801. enum cache_metadata_mode old_mode = get_cache_mode(cache);
  802. if (new_mode == CM_WRITE && needs_check) {
  803. DMERR("%s: unable to switch cache to write mode until repaired.",
  804. cache_device_name(cache));
  805. if (old_mode != new_mode)
  806. new_mode = old_mode;
  807. else
  808. new_mode = CM_READ_ONLY;
  809. }
  810. /* Never move out of fail mode */
  811. if (old_mode == CM_FAIL)
  812. new_mode = CM_FAIL;
  813. switch (new_mode) {
  814. case CM_FAIL:
  815. case CM_READ_ONLY:
  816. dm_cache_metadata_set_read_only(cache->cmd);
  817. break;
  818. case CM_WRITE:
  819. dm_cache_metadata_set_read_write(cache->cmd);
  820. break;
  821. }
  822. cache->features.mode = new_mode;
  823. if (new_mode != old_mode)
  824. notify_mode_switch(cache, new_mode);
  825. }
  826. static void abort_transaction(struct cache *cache)
  827. {
  828. const char *dev_name = cache_device_name(cache);
  829. if (get_cache_mode(cache) >= CM_READ_ONLY)
  830. return;
  831. if (dm_cache_metadata_set_needs_check(cache->cmd)) {
  832. DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
  833. set_cache_mode(cache, CM_FAIL);
  834. }
  835. DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
  836. if (dm_cache_metadata_abort(cache->cmd)) {
  837. DMERR("%s: failed to abort metadata transaction", dev_name);
  838. set_cache_mode(cache, CM_FAIL);
  839. }
  840. }
  841. static void metadata_operation_failed(struct cache *cache, const char *op, int r)
  842. {
  843. DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
  844. cache_device_name(cache), op, r);
  845. abort_transaction(cache);
  846. set_cache_mode(cache, CM_READ_ONLY);
  847. }
  848. /*----------------------------------------------------------------
  849. * Migration processing
  850. *
  851. * Migration covers moving data from the origin device to the cache, or
  852. * vice versa.
  853. *--------------------------------------------------------------*/
  854. static void inc_io_migrations(struct cache *cache)
  855. {
  856. atomic_inc(&cache->nr_io_migrations);
  857. }
  858. static void dec_io_migrations(struct cache *cache)
  859. {
  860. atomic_dec(&cache->nr_io_migrations);
  861. }
  862. static void __cell_release(struct cache *cache, struct dm_bio_prison_cell *cell,
  863. bool holder, struct bio_list *bios)
  864. {
  865. (holder ? dm_cell_release : dm_cell_release_no_holder)
  866. (cache->prison, cell, bios);
  867. free_prison_cell(cache, cell);
  868. }
  869. static bool discard_or_flush(struct bio *bio)
  870. {
  871. return bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD);
  872. }
  873. static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell)
  874. {
  875. if (discard_or_flush(cell->holder))
  876. /*
  877. * We have to handle these bios
  878. * individually.
  879. */
  880. __cell_release(cache, cell, true, &cache->deferred_bios);
  881. else
  882. list_add_tail(&cell->user_list, &cache->deferred_cells);
  883. }
  884. static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, bool holder)
  885. {
  886. unsigned long flags;
  887. if (!holder && dm_cell_promote_or_release(cache->prison, cell)) {
  888. /*
  889. * There was no prisoner to promote to holder, the
  890. * cell has been released.
  891. */
  892. free_prison_cell(cache, cell);
  893. return;
  894. }
  895. spin_lock_irqsave(&cache->lock, flags);
  896. __cell_defer(cache, cell);
  897. spin_unlock_irqrestore(&cache->lock, flags);
  898. wake_worker(cache);
  899. }
  900. static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err)
  901. {
  902. dm_cell_error(cache->prison, cell, err);
  903. dm_bio_prison_free_cell(cache->prison, cell);
  904. }
  905. static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell)
  906. {
  907. cell_error_with_code(cache, cell, DM_ENDIO_REQUEUE);
  908. }
  909. static void free_io_migration(struct dm_cache_migration *mg)
  910. {
  911. dec_io_migrations(mg->cache);
  912. free_migration(mg);
  913. }
  914. static void migration_failure(struct dm_cache_migration *mg)
  915. {
  916. struct cache *cache = mg->cache;
  917. const char *dev_name = cache_device_name(cache);
  918. if (mg->writeback) {
  919. DMERR_LIMIT("%s: writeback failed; couldn't copy block", dev_name);
  920. set_dirty(cache, mg->old_oblock, mg->cblock);
  921. cell_defer(cache, mg->old_ocell, false);
  922. } else if (mg->demote) {
  923. DMERR_LIMIT("%s: demotion failed; couldn't copy block", dev_name);
  924. policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
  925. cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
  926. if (mg->promote)
  927. cell_defer(cache, mg->new_ocell, true);
  928. } else {
  929. DMERR_LIMIT("%s: promotion failed; couldn't copy block", dev_name);
  930. policy_remove_mapping(cache->policy, mg->new_oblock);
  931. cell_defer(cache, mg->new_ocell, true);
  932. }
  933. free_io_migration(mg);
  934. }
  935. static void migration_success_pre_commit(struct dm_cache_migration *mg)
  936. {
  937. int r;
  938. unsigned long flags;
  939. struct cache *cache = mg->cache;
  940. if (mg->writeback) {
  941. clear_dirty(cache, mg->old_oblock, mg->cblock);
  942. cell_defer(cache, mg->old_ocell, false);
  943. free_io_migration(mg);
  944. return;
  945. } else if (mg->demote) {
  946. r = dm_cache_remove_mapping(cache->cmd, mg->cblock);
  947. if (r) {
  948. DMERR_LIMIT("%s: demotion failed; couldn't update on disk metadata",
  949. cache_device_name(cache));
  950. metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
  951. policy_force_mapping(cache->policy, mg->new_oblock,
  952. mg->old_oblock);
  953. if (mg->promote)
  954. cell_defer(cache, mg->new_ocell, true);
  955. free_io_migration(mg);
  956. return;
  957. }
  958. } else {
  959. r = dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock);
  960. if (r) {
  961. DMERR_LIMIT("%s: promotion failed; couldn't update on disk metadata",
  962. cache_device_name(cache));
  963. metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
  964. policy_remove_mapping(cache->policy, mg->new_oblock);
  965. free_io_migration(mg);
  966. return;
  967. }
  968. }
  969. spin_lock_irqsave(&cache->lock, flags);
  970. list_add_tail(&mg->list, &cache->need_commit_migrations);
  971. cache->commit_requested = true;
  972. spin_unlock_irqrestore(&cache->lock, flags);
  973. }
  974. static void migration_success_post_commit(struct dm_cache_migration *mg)
  975. {
  976. unsigned long flags;
  977. struct cache *cache = mg->cache;
  978. if (mg->writeback) {
  979. DMWARN_LIMIT("%s: writeback unexpectedly triggered commit",
  980. cache_device_name(cache));
  981. return;
  982. } else if (mg->demote) {
  983. cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
  984. if (mg->promote) {
  985. mg->demote = false;
  986. spin_lock_irqsave(&cache->lock, flags);
  987. list_add_tail(&mg->list, &cache->quiesced_migrations);
  988. spin_unlock_irqrestore(&cache->lock, flags);
  989. } else {
  990. if (mg->invalidate)
  991. policy_remove_mapping(cache->policy, mg->old_oblock);
  992. free_io_migration(mg);
  993. }
  994. } else {
  995. if (mg->requeue_holder) {
  996. clear_dirty(cache, mg->new_oblock, mg->cblock);
  997. cell_defer(cache, mg->new_ocell, true);
  998. } else {
  999. /*
  1000. * The block was promoted via an overwrite, so it's dirty.
  1001. */
  1002. set_dirty(cache, mg->new_oblock, mg->cblock);
  1003. bio_endio(mg->new_ocell->holder, 0);
  1004. cell_defer(cache, mg->new_ocell, false);
  1005. }
  1006. free_io_migration(mg);
  1007. }
  1008. }
  1009. static void copy_complete(int read_err, unsigned long write_err, void *context)
  1010. {
  1011. unsigned long flags;
  1012. struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
  1013. struct cache *cache = mg->cache;
  1014. if (read_err || write_err)
  1015. mg->err = true;
  1016. spin_lock_irqsave(&cache->lock, flags);
  1017. list_add_tail(&mg->list, &cache->completed_migrations);
  1018. spin_unlock_irqrestore(&cache->lock, flags);
  1019. wake_worker(cache);
  1020. }
  1021. static void issue_copy(struct dm_cache_migration *mg)
  1022. {
  1023. int r;
  1024. struct dm_io_region o_region, c_region;
  1025. struct cache *cache = mg->cache;
  1026. sector_t cblock = from_cblock(mg->cblock);
  1027. o_region.bdev = cache->origin_dev->bdev;
  1028. o_region.count = cache->sectors_per_block;
  1029. c_region.bdev = cache->cache_dev->bdev;
  1030. c_region.sector = cblock * cache->sectors_per_block;
  1031. c_region.count = cache->sectors_per_block;
  1032. if (mg->writeback || mg->demote) {
  1033. /* demote */
  1034. o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
  1035. r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
  1036. } else {
  1037. /* promote */
  1038. o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
  1039. r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
  1040. }
  1041. if (r < 0) {
  1042. DMERR_LIMIT("%s: issuing migration failed", cache_device_name(cache));
  1043. migration_failure(mg);
  1044. }
  1045. }
  1046. static void overwrite_endio(struct bio *bio, int err)
  1047. {
  1048. struct dm_cache_migration *mg = bio->bi_private;
  1049. struct cache *cache = mg->cache;
  1050. size_t pb_data_size = get_per_bio_data_size(cache);
  1051. struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
  1052. unsigned long flags;
  1053. dm_unhook_bio(&pb->hook_info, bio);
  1054. if (err)
  1055. mg->err = true;
  1056. mg->requeue_holder = false;
  1057. spin_lock_irqsave(&cache->lock, flags);
  1058. list_add_tail(&mg->list, &cache->completed_migrations);
  1059. spin_unlock_irqrestore(&cache->lock, flags);
  1060. wake_worker(cache);
  1061. }
  1062. static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
  1063. {
  1064. size_t pb_data_size = get_per_bio_data_size(mg->cache);
  1065. struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
  1066. dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
  1067. remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);
  1068. /*
  1069. * No need to inc_ds() here, since the cell will be held for the
  1070. * duration of the io.
  1071. */
  1072. accounted_request(mg->cache, bio);
  1073. }
  1074. static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
  1075. {
  1076. return (bio_data_dir(bio) == WRITE) &&
  1077. (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
  1078. }
  1079. static void avoid_copy(struct dm_cache_migration *mg)
  1080. {
  1081. atomic_inc(&mg->cache->stats.copies_avoided);
  1082. migration_success_pre_commit(mg);
  1083. }
  1084. static void calc_discard_block_range(struct cache *cache, struct bio *bio,
  1085. dm_dblock_t *b, dm_dblock_t *e)
  1086. {
  1087. sector_t sb = bio->bi_iter.bi_sector;
  1088. sector_t se = bio_end_sector(bio);
  1089. *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
  1090. if (se - sb < cache->discard_block_size)
  1091. *e = *b;
  1092. else
  1093. *e = to_dblock(block_div(se, cache->discard_block_size));
  1094. }
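/*
 * Worked example, not part of the upstream file (the sizes are
 * assumptions for illustration): with discard_block_size = 1024
 * sectors, a discard of sectors [1000, 5000) gives
 * b = dm_sector_div_up(1000, 1024) = 1 and e = 5000 / 1024 = 4, so only
 * discard blocks 1-3 (sectors 1024-4095), which the bio covers
 * completely, are marked; partially covered blocks at either end are
 * left alone.
 */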
  1095. static void issue_discard(struct dm_cache_migration *mg)
  1096. {
  1097. dm_dblock_t b, e;
  1098. struct bio *bio = mg->new_ocell->holder;
  1099. calc_discard_block_range(mg->cache, bio, &b, &e);
  1100. while (b != e) {
  1101. set_discard(mg->cache, b);
  1102. b = to_dblock(from_dblock(b) + 1);
  1103. }
  1104. bio_endio(bio, 0);
  1105. cell_defer(mg->cache, mg->new_ocell, false);
  1106. free_migration(mg);
  1107. }
  1108. static void issue_copy_or_discard(struct dm_cache_migration *mg)
  1109. {
  1110. bool avoid;
  1111. struct cache *cache = mg->cache;
  1112. if (mg->discard) {
  1113. issue_discard(mg);
  1114. return;
  1115. }
  1116. if (mg->writeback || mg->demote)
  1117. avoid = !is_dirty(cache, mg->cblock) ||
  1118. is_discarded_oblock(cache, mg->old_oblock);
  1119. else {
  1120. struct bio *bio = mg->new_ocell->holder;
  1121. avoid = is_discarded_oblock(cache, mg->new_oblock);
  1122. if (writeback_mode(&cache->features) &&
  1123. !avoid && bio_writes_complete_block(cache, bio)) {
  1124. issue_overwrite(mg, bio);
  1125. return;
  1126. }
  1127. }
  1128. avoid ? avoid_copy(mg) : issue_copy(mg);
  1129. }
  1130. static void complete_migration(struct dm_cache_migration *mg)
  1131. {
  1132. if (mg->err)
  1133. migration_failure(mg);
  1134. else
  1135. migration_success_pre_commit(mg);
  1136. }
  1137. static void process_migrations(struct cache *cache, struct list_head *head,
  1138. void (*fn)(struct dm_cache_migration *))
  1139. {
  1140. unsigned long flags;
  1141. struct list_head list;
  1142. struct dm_cache_migration *mg, *tmp;
  1143. INIT_LIST_HEAD(&list);
  1144. spin_lock_irqsave(&cache->lock, flags);
  1145. list_splice_init(head, &list);
  1146. spin_unlock_irqrestore(&cache->lock, flags);
  1147. list_for_each_entry_safe(mg, tmp, &list, list)
  1148. fn(mg);
  1149. }
  1150. static void __queue_quiesced_migration(struct dm_cache_migration *mg)
  1151. {
  1152. list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
  1153. }
  1154. static void queue_quiesced_migration(struct dm_cache_migration *mg)
  1155. {
  1156. unsigned long flags;
  1157. struct cache *cache = mg->cache;
  1158. spin_lock_irqsave(&cache->lock, flags);
  1159. __queue_quiesced_migration(mg);
  1160. spin_unlock_irqrestore(&cache->lock, flags);
  1161. wake_worker(cache);
  1162. }
  1163. static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
  1164. {
  1165. unsigned long flags;
  1166. struct dm_cache_migration *mg, *tmp;
  1167. spin_lock_irqsave(&cache->lock, flags);
  1168. list_for_each_entry_safe(mg, tmp, work, list)
  1169. __queue_quiesced_migration(mg);
  1170. spin_unlock_irqrestore(&cache->lock, flags);
  1171. wake_worker(cache);
  1172. }
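/*
 * Called on bio completion: dropping the bio's deferred set entry may
 * release migrations that were waiting for in-flight io to drain, so
 * queue any that have become runnable.
 */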
  1173. static void check_for_quiesced_migrations(struct cache *cache,
  1174. struct per_bio_data *pb)
  1175. {
  1176. struct list_head work;
  1177. if (!pb->all_io_entry)
  1178. return;
  1179. INIT_LIST_HEAD(&work);
  1180. dm_deferred_entry_dec(pb->all_io_entry, &work);
  1181. if (!list_empty(&work))
  1182. queue_quiesced_migrations(cache, &work);
  1183. }
  1184. static void quiesce_migration(struct dm_cache_migration *mg)
  1185. {
  1186. if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
  1187. queue_quiesced_migration(mg);
  1188. }
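/*
 * The helpers below fill out a preallocated migration struct for each
 * kind of operation (promote, writeback, demote+promote, invalidate,
 * discard) and hand it to quiesce_migration(), which holds the work back
 * until the io that was in flight when it was queued has completed.
 */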
  1189. static void promote(struct cache *cache, struct prealloc *structs,
  1190. dm_oblock_t oblock, dm_cblock_t cblock,
  1191. struct dm_bio_prison_cell *cell)
  1192. {
  1193. struct dm_cache_migration *mg = prealloc_get_migration(structs);
  1194. mg->err = false;
  1195. mg->discard = false;
  1196. mg->writeback = false;
  1197. mg->demote = false;
  1198. mg->promote = true;
  1199. mg->requeue_holder = true;
  1200. mg->invalidate = false;
  1201. mg->cache = cache;
  1202. mg->new_oblock = oblock;
  1203. mg->cblock = cblock;
  1204. mg->old_ocell = NULL;
  1205. mg->new_ocell = cell;
  1206. mg->start_jiffies = jiffies;
  1207. inc_io_migrations(cache);
  1208. quiesce_migration(mg);
  1209. }
  1210. static void writeback(struct cache *cache, struct prealloc *structs,
  1211. dm_oblock_t oblock, dm_cblock_t cblock,
  1212. struct dm_bio_prison_cell *cell)
  1213. {
  1214. struct dm_cache_migration *mg = prealloc_get_migration(structs);
  1215. mg->err = false;
  1216. mg->discard = false;
  1217. mg->writeback = true;
  1218. mg->demote = false;
  1219. mg->promote = false;
  1220. mg->requeue_holder = true;
  1221. mg->invalidate = false;
  1222. mg->cache = cache;
  1223. mg->old_oblock = oblock;
  1224. mg->cblock = cblock;
  1225. mg->old_ocell = cell;
  1226. mg->new_ocell = NULL;
  1227. mg->start_jiffies = jiffies;
  1228. inc_io_migrations(cache);
  1229. quiesce_migration(mg);
  1230. }
  1231. static void demote_then_promote(struct cache *cache, struct prealloc *structs,
  1232. dm_oblock_t old_oblock, dm_oblock_t new_oblock,
  1233. dm_cblock_t cblock,
  1234. struct dm_bio_prison_cell *old_ocell,
  1235. struct dm_bio_prison_cell *new_ocell)
  1236. {
  1237. struct dm_cache_migration *mg = prealloc_get_migration(structs);
  1238. mg->err = false;
  1239. mg->discard = false;
  1240. mg->writeback = false;
  1241. mg->demote = true;
  1242. mg->promote = true;
  1243. mg->requeue_holder = true;
  1244. mg->invalidate = false;
  1245. mg->cache = cache;
  1246. mg->old_oblock = old_oblock;
  1247. mg->new_oblock = new_oblock;
  1248. mg->cblock = cblock;
  1249. mg->old_ocell = old_ocell;
  1250. mg->new_ocell = new_ocell;
  1251. mg->start_jiffies = jiffies;
  1252. inc_io_migrations(cache);
  1253. quiesce_migration(mg);
  1254. }
  1255. /*
  1256. * Invalidate a cache entry. No writeback occurs; any changes in the cache
  1257. * block are thrown away.
  1258. */
  1259. static void invalidate(struct cache *cache, struct prealloc *structs,
  1260. dm_oblock_t oblock, dm_cblock_t cblock,
  1261. struct dm_bio_prison_cell *cell)
  1262. {
  1263. struct dm_cache_migration *mg = prealloc_get_migration(structs);
  1264. mg->err = false;
  1265. mg->discard = false;
  1266. mg->writeback = false;
  1267. mg->demote = true;
  1268. mg->promote = false;
  1269. mg->requeue_holder = true;
  1270. mg->invalidate = true;
  1271. mg->cache = cache;
  1272. mg->old_oblock = oblock;
  1273. mg->cblock = cblock;
  1274. mg->old_ocell = cell;
  1275. mg->new_ocell = NULL;
  1276. mg->start_jiffies = jiffies;
  1277. inc_io_migrations(cache);
  1278. quiesce_migration(mg);
  1279. }
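/*
 * Discard 'migrations' don't copy any data and aren't counted against
 * the migration bandwidth (note the absence of inc_io_migrations()).
 */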
  1280. static void discard(struct cache *cache, struct prealloc *structs,
  1281. struct dm_bio_prison_cell *cell)
  1282. {
  1283. struct dm_cache_migration *mg = prealloc_get_migration(structs);
  1284. mg->err = false;
  1285. mg->discard = true;
  1286. mg->writeback = false;
  1287. mg->demote = false;
  1288. mg->promote = false;
  1289. mg->requeue_holder = false;
  1290. mg->invalidate = false;
  1291. mg->cache = cache;
  1292. mg->old_ocell = NULL;
  1293. mg->new_ocell = cell;
  1294. mg->start_jiffies = jiffies;
  1295. quiesce_migration(mg);
  1296. }
  1297. /*----------------------------------------------------------------
  1298. * bio processing
  1299. *--------------------------------------------------------------*/
  1300. static void defer_bio(struct cache *cache, struct bio *bio)
  1301. {
  1302. unsigned long flags;
  1303. spin_lock_irqsave(&cache->lock, flags);
  1304. bio_list_add(&cache->deferred_bios, bio);
  1305. spin_unlock_irqrestore(&cache->lock, flags);
  1306. wake_worker(cache);
  1307. }
  1308. static void process_flush_bio(struct cache *cache, struct bio *bio)
  1309. {
  1310. size_t pb_data_size = get_per_bio_data_size(cache);
  1311. struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
  1312. BUG_ON(bio->bi_iter.bi_size);
  1313. if (!pb->req_nr)
  1314. remap_to_origin(cache, bio);
  1315. else
  1316. remap_to_cache(cache, bio, 0);
  1317. /*
  1318. * REQ_FLUSH is not directed at any particular block so we don't
  1319. * need to inc_ds(). REQ_FUA's are split into a write + REQ_FLUSH
  1320. * by dm-core.
  1321. */
  1322. issue(cache, bio);
  1323. }
  1324. static void process_discard_bio(struct cache *cache, struct prealloc *structs,
  1325. struct bio *bio)
  1326. {
  1327. int r;
  1328. dm_dblock_t b, e;
  1329. struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
  1330. calc_discard_block_range(cache, bio, &b, &e);
  1331. if (b == e) {
  1332. bio_endio(bio, 0);
  1333. return;
  1334. }
  1335. cell_prealloc = prealloc_get_cell(structs);
  1336. r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc,
  1337. (cell_free_fn) prealloc_put_cell,
  1338. structs, &new_ocell);
  1339. if (r > 0)
  1340. return;
  1341. discard(cache, structs, new_ocell);
  1342. }
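/*
 * migration_threshold is expressed in sectors; only start another
 * migration if doing so keeps the volume of data currently being copied
 * under that limit.
 */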
  1343. static bool spare_migration_bandwidth(struct cache *cache)
  1344. {
  1345. sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
  1346. cache->sectors_per_block;
  1347. return current_volume < cache->migration_threshold;
  1348. }
  1349. static void inc_hit_counter(struct cache *cache, struct bio *bio)
  1350. {
  1351. atomic_inc(bio_data_dir(bio) == READ ?
  1352. &cache->stats.read_hit : &cache->stats.write_hit);
  1353. }
  1354. static void inc_miss_counter(struct cache *cache, struct bio *bio)
  1355. {
  1356. atomic_inc(bio_data_dir(bio) == READ ?
  1357. &cache->stats.read_miss : &cache->stats.write_miss);
  1358. }
  1359. /*----------------------------------------------------------------*/
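/*
 * Context used with dm_cell_visit_release(): gather the bios held in a
 * cell (bumping their deferred set entries), set aside discards and
 * flushes for the worker, and note whether any of the bios are writes.
 */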
  1360. struct inc_detail {
  1361. struct cache *cache;
  1362. struct bio_list bios_for_issue;
  1363. struct bio_list unhandled_bios;
  1364. bool any_writes;
  1365. };
  1366. static void inc_fn(void *context, struct dm_bio_prison_cell *cell)
  1367. {
  1368. struct bio *bio;
  1369. struct inc_detail *detail = context;
  1370. struct cache *cache = detail->cache;
  1371. inc_ds(cache, cell->holder, cell);
  1372. if (bio_data_dir(cell->holder) == WRITE)
  1373. detail->any_writes = true;
  1374. while ((bio = bio_list_pop(&cell->bios))) {
  1375. if (discard_or_flush(bio)) {
  1376. bio_list_add(&detail->unhandled_bios, bio);
  1377. continue;
  1378. }
  1379. if (bio_data_dir(bio) == WRITE)
  1380. detail->any_writes = true;
  1381. bio_list_add(&detail->bios_for_issue, bio);
  1382. inc_ds(cache, bio, cell);
  1383. }
  1384. }
  1385. // FIXME: refactor these two
  1386. static void remap_cell_to_origin_clear_discard(struct cache *cache,
  1387. struct dm_bio_prison_cell *cell,
  1388. dm_oblock_t oblock, bool issue_holder)
  1389. {
  1390. struct bio *bio;
  1391. unsigned long flags;
  1392. struct inc_detail detail;
  1393. detail.cache = cache;
  1394. bio_list_init(&detail.bios_for_issue);
  1395. bio_list_init(&detail.unhandled_bios);
  1396. detail.any_writes = false;
  1397. spin_lock_irqsave(&cache->lock, flags);
  1398. dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
  1399. bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
  1400. spin_unlock_irqrestore(&cache->lock, flags);
  1401. remap_to_origin(cache, cell->holder);
  1402. if (issue_holder)
  1403. issue(cache, cell->holder);
  1404. else
  1405. accounted_begin(cache, cell->holder);
  1406. if (detail.any_writes)
  1407. clear_discard(cache, oblock_to_dblock(cache, oblock));
  1408. while ((bio = bio_list_pop(&detail.bios_for_issue))) {
  1409. remap_to_origin(cache, bio);
  1410. issue(cache, bio);
  1411. }
  1412. }
  1413. static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell,
  1414. dm_oblock_t oblock, dm_cblock_t cblock, bool issue_holder)
  1415. {
  1416. struct bio *bio;
  1417. unsigned long flags;
  1418. struct inc_detail detail;
  1419. detail.cache = cache;
  1420. bio_list_init(&detail.bios_for_issue);
  1421. bio_list_init(&detail.unhandled_bios);
  1422. detail.any_writes = false;
  1423. spin_lock_irqsave(&cache->lock, flags);
  1424. dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
  1425. bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
  1426. spin_unlock_irqrestore(&cache->lock, flags);
  1427. remap_to_cache(cache, cell->holder, cblock);
  1428. if (issue_holder)
  1429. issue(cache, cell->holder);
  1430. else
  1431. accounted_begin(cache, cell->holder);
  1432. if (detail.any_writes) {
  1433. set_dirty(cache, oblock, cblock);
  1434. clear_discard(cache, oblock_to_dblock(cache, oblock));
  1435. }
  1436. while ((bio = bio_list_pop(&detail.bios_for_issue))) {
  1437. remap_to_cache(cache, bio, cblock);
  1438. issue(cache, bio);
  1439. }
  1440. }
  1441. /*----------------------------------------------------------------*/
  1442. struct old_oblock_lock {
  1443. struct policy_locker locker;
  1444. struct cache *cache;
  1445. struct prealloc *structs;
  1446. struct dm_bio_prison_cell *cell;
  1447. };
  1448. static int null_locker(struct policy_locker *locker, dm_oblock_t b)
  1449. {
  1450. /* This should never be called */
  1451. BUG();
  1452. return 0;
  1453. }
  1454. static int cell_locker(struct policy_locker *locker, dm_oblock_t b)
  1455. {
  1456. struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker);
  1457. struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs);
  1458. return bio_detain(l->cache, b, NULL, cell_prealloc,
  1459. (cell_free_fn) prealloc_put_cell,
  1460. l->structs, &l->cell);
  1461. }
  1462. static void process_cell(struct cache *cache, struct prealloc *structs,
  1463. struct dm_bio_prison_cell *new_ocell)
  1464. {
  1465. int r;
  1466. bool release_cell = true;
  1467. struct bio *bio = new_ocell->holder;
  1468. dm_oblock_t block = get_bio_block(cache, bio);
  1469. struct policy_result lookup_result;
  1470. bool passthrough = passthrough_mode(&cache->features);
  1471. bool fast_promotion, can_migrate;
  1472. struct old_oblock_lock ool;
  1473. fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
  1474. can_migrate = !passthrough && (fast_promotion || spare_migration_bandwidth(cache));
  1475. ool.locker.fn = cell_locker;
  1476. ool.cache = cache;
  1477. ool.structs = structs;
  1478. ool.cell = NULL;
  1479. r = policy_map(cache->policy, block, true, can_migrate, fast_promotion,
  1480. bio, &ool.locker, &lookup_result);
  1481. if (r == -EWOULDBLOCK)
  1482. /* migration has been denied */
  1483. lookup_result.op = POLICY_MISS;
  1484. switch (lookup_result.op) {
  1485. case POLICY_HIT:
  1486. if (passthrough) {
  1487. inc_miss_counter(cache, bio);
  1488. /*
  1489. * Passthrough always maps to the origin,
  1490. * invalidating any cache blocks that are written
  1491. * to.
  1492. */
  1493. if (bio_data_dir(bio) == WRITE) {
  1494. atomic_inc(&cache->stats.demotion);
  1495. invalidate(cache, structs, block, lookup_result.cblock, new_ocell);
  1496. release_cell = false;
  1497. } else {
  1498. /* FIXME: factor out issue_origin() */
  1499. remap_to_origin_clear_discard(cache, bio, block);
  1500. inc_and_issue(cache, bio, new_ocell);
  1501. }
  1502. } else {
  1503. inc_hit_counter(cache, bio);
  1504. if (bio_data_dir(bio) == WRITE &&
  1505. writethrough_mode(&cache->features) &&
  1506. !is_dirty(cache, lookup_result.cblock)) {
  1507. remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
  1508. inc_and_issue(cache, bio, new_ocell);
  1509. } else {
  1510. remap_cell_to_cache_dirty(cache, new_ocell, block, lookup_result.cblock, true);
  1511. release_cell = false;
  1512. }
  1513. }
  1514. break;
  1515. case POLICY_MISS:
  1516. inc_miss_counter(cache, bio);
  1517. remap_cell_to_origin_clear_discard(cache, new_ocell, block, true);
  1518. release_cell = false;
  1519. break;
  1520. case POLICY_NEW:
  1521. atomic_inc(&cache->stats.promotion);
  1522. promote(cache, structs, block, lookup_result.cblock, new_ocell);
  1523. release_cell = false;
  1524. break;
  1525. case POLICY_REPLACE:
  1526. atomic_inc(&cache->stats.demotion);
  1527. atomic_inc(&cache->stats.promotion);
  1528. demote_then_promote(cache, structs, lookup_result.old_oblock,
  1529. block, lookup_result.cblock,
  1530. ool.cell, new_ocell);
  1531. release_cell = false;
  1532. break;
  1533. default:
  1534. DMERR_LIMIT("%s: %s: erroring bio, unknown policy op: %u",
  1535. cache_device_name(cache), __func__,
  1536. (unsigned) lookup_result.op);
  1537. bio_io_error(bio);
  1538. }
  1539. if (release_cell)
  1540. cell_defer(cache, new_ocell, false);
  1541. }
  1542. static void process_bio(struct cache *cache, struct prealloc *structs,
  1543. struct bio *bio)
  1544. {
  1545. int r;
  1546. dm_oblock_t block = get_bio_block(cache, bio);
  1547. struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
  1548. /*
  1549. * Check to see if that block is currently migrating.
  1550. */
  1551. cell_prealloc = prealloc_get_cell(structs);
  1552. r = bio_detain(cache, block, bio, cell_prealloc,
  1553. (cell_free_fn) prealloc_put_cell,
  1554. structs, &new_ocell);
  1555. if (r > 0)
  1556. return;
  1557. process_cell(cache, structs, new_ocell);
  1558. }
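/*
 * Commit if the periodic commit interval has elapsed, or if jiffies has
 * wrapped since the last commit.
 */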
  1559. static int need_commit_due_to_time(struct cache *cache)
  1560. {
  1561. return jiffies < cache->last_commit_jiffies ||
  1562. jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
  1563. }
  1564. /*
  1565. * A non-zero return indicates read_only or fail_io mode.
  1566. */
  1567. static int commit(struct cache *cache, bool clean_shutdown)
  1568. {
  1569. int r;
  1570. if (get_cache_mode(cache) >= CM_READ_ONLY)
  1571. return -EINVAL;
  1572. atomic_inc(&cache->stats.commit_count);
  1573. r = dm_cache_commit(cache->cmd, clean_shutdown);
  1574. if (r)
  1575. metadata_operation_failed(cache, "dm_cache_commit", r);
  1576. return r;
  1577. }
  1578. static int commit_if_needed(struct cache *cache)
  1579. {
  1580. int r = 0;
  1581. if ((cache->commit_requested || need_commit_due_to_time(cache)) &&
  1582. dm_cache_changed_this_transaction(cache->cmd)) {
  1583. r = commit(cache, false);
  1584. cache->commit_requested = false;
  1585. cache->last_commit_jiffies = jiffies;
  1586. }
  1587. return r;
  1588. }
  1589. static void process_deferred_bios(struct cache *cache)
  1590. {
  1591. unsigned long flags;
  1592. struct bio_list bios;
  1593. struct bio *bio;
  1594. struct prealloc structs;
  1595. memset(&structs, 0, sizeof(structs));
  1596. bio_list_init(&bios);
  1597. spin_lock_irqsave(&cache->lock, flags);
  1598. bio_list_merge(&bios, &cache->deferred_bios);
  1599. bio_list_init(&cache->deferred_bios);
  1600. spin_unlock_irqrestore(&cache->lock, flags);
  1601. while (!bio_list_empty(&bios)) {
  1602. /*
  1603. * If we've got no free migration structs, and processing
  1604. * this bio might require one, we pause until there are some
  1605. * prepared mappings to process.
  1606. */
  1607. if (prealloc_data_structs(cache, &structs)) {
  1608. spin_lock_irqsave(&cache->lock, flags);
  1609. bio_list_merge(&cache->deferred_bios, &bios);
  1610. spin_unlock_irqrestore(&cache->lock, flags);
  1611. break;
  1612. }
  1613. bio = bio_list_pop(&bios);
  1614. if (bio->bi_rw & REQ_FLUSH)
  1615. process_flush_bio(cache, bio);
  1616. else if (bio->bi_rw & REQ_DISCARD)
  1617. process_discard_bio(cache, &structs, bio);
  1618. else
  1619. process_bio(cache, &structs, bio);
  1620. }
  1621. prealloc_free_structs(cache, &structs);
  1622. }
  1623. static void process_deferred_cells(struct cache *cache)
  1624. {
  1625. unsigned long flags;
  1626. struct dm_bio_prison_cell *cell, *tmp;
  1627. struct list_head cells;
  1628. struct prealloc structs;
  1629. memset(&structs, 0, sizeof(structs));
  1630. INIT_LIST_HEAD(&cells);
  1631. spin_lock_irqsave(&cache->lock, flags);
  1632. list_splice_init(&cache->deferred_cells, &cells);
  1633. spin_unlock_irqrestore(&cache->lock, flags);
  1634. list_for_each_entry_safe(cell, tmp, &cells, user_list) {
  1635. /*
  1636. * If we've got no free migration structs, and processing
  1637. * this bio might require one, we pause until there are some
  1638. * prepared mappings to process.
  1639. */
  1640. if (prealloc_data_structs(cache, &structs)) {
  1641. spin_lock_irqsave(&cache->lock, flags);
  1642. list_splice(&cells, &cache->deferred_cells);
  1643. spin_unlock_irqrestore(&cache->lock, flags);
  1644. break;
  1645. }
  1646. process_cell(cache, &structs, cell);
  1647. }
  1648. prealloc_free_structs(cache, &structs);
  1649. }
  1650. static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
  1651. {
  1652. unsigned long flags;
  1653. struct bio_list bios;
  1654. struct bio *bio;
  1655. bio_list_init(&bios);
  1656. spin_lock_irqsave(&cache->lock, flags);
  1657. bio_list_merge(&bios, &cache->deferred_flush_bios);
  1658. bio_list_init(&cache->deferred_flush_bios);
  1659. spin_unlock_irqrestore(&cache->lock, flags);
  1660. /*
  1661. * These bios have already been through inc_ds()
  1662. */
  1663. while ((bio = bio_list_pop(&bios)))
  1664. submit_bios ? accounted_request(cache, bio) : bio_io_error(bio);
  1665. }
  1666. static void process_deferred_writethrough_bios(struct cache *cache)
  1667. {
  1668. unsigned long flags;
  1669. struct bio_list bios;
  1670. struct bio *bio;
  1671. bio_list_init(&bios);
  1672. spin_lock_irqsave(&cache->lock, flags);
  1673. bio_list_merge(&bios, &cache->deferred_writethrough_bios);
  1674. bio_list_init(&cache->deferred_writethrough_bios);
  1675. spin_unlock_irqrestore(&cache->lock, flags);
  1676. /*
  1677. * These bios have already been through inc_ds()
  1678. */
  1679. while ((bio = bio_list_pop(&bios)))
  1680. accounted_request(cache, bio);
  1681. }
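/*
 * While there is spare migration bandwidth, ask the policy for dirty
 * blocks to clean.  'busy' tells the policy whether the origin has seen
 * io within the last second, so it can throttle cleaning while the
 * device is in active use.
 */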
  1682. static void writeback_some_dirty_blocks(struct cache *cache)
  1683. {
  1684. int r = 0;
  1685. dm_oblock_t oblock;
  1686. dm_cblock_t cblock;
  1687. struct prealloc structs;
  1688. struct dm_bio_prison_cell *old_ocell;
  1689. bool busy = !iot_idle_for(&cache->origin_tracker, HZ);
  1690. memset(&structs, 0, sizeof(structs));
  1691. while (spare_migration_bandwidth(cache)) {
  1692. if (prealloc_data_structs(cache, &structs))
  1693. break;
  1694. r = policy_writeback_work(cache->policy, &oblock, &cblock, busy);
  1695. if (r)
  1696. break;
  1697. r = get_cell(cache, oblock, &structs, &old_ocell);
  1698. if (r) {
  1699. policy_set_dirty(cache->policy, oblock);
  1700. break;
  1701. }
  1702. writeback(cache, &structs, oblock, cblock, old_ocell);
  1703. }
  1704. prealloc_free_structs(cache, &structs);
  1705. }
  1706. /*----------------------------------------------------------------
  1707. * Invalidations.
  1708. * Dropping something from the cache *without* writing back.
  1709. *--------------------------------------------------------------*/
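/*
 * These requests originate from the 'invalidate_cblocks' target message;
 * each one describes a half-open range [begin, end) of cache blocks to
 * drop from both the policy and the metadata.
 */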
  1710. static void process_invalidation_request(struct cache *cache, struct invalidation_request *req)
  1711. {
  1712. int r = 0;
  1713. uint64_t begin = from_cblock(req->cblocks->begin);
  1714. uint64_t end = from_cblock(req->cblocks->end);
  1715. while (begin != end) {
  1716. r = policy_remove_cblock(cache->policy, to_cblock(begin));
  1717. if (!r) {
  1718. r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin));
  1719. if (r) {
  1720. metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
  1721. break;
  1722. }
  1723. } else if (r == -ENODATA) {
  1724. /* harmless, already unmapped */
  1725. r = 0;
  1726. } else {
  1727. DMERR("%s: policy_remove_cblock failed", cache_device_name(cache));
  1728. break;
  1729. }
  1730. begin++;
  1731. }
  1732. cache->commit_requested = true;
  1733. req->err = r;
  1734. atomic_set(&req->complete, 1);
  1735. wake_up(&req->result_wait);
  1736. }
  1737. static void process_invalidation_requests(struct cache *cache)
  1738. {
  1739. struct list_head list;
  1740. struct invalidation_request *req, *tmp;
  1741. INIT_LIST_HEAD(&list);
  1742. spin_lock(&cache->invalidation_lock);
  1743. list_splice_init(&cache->invalidation_requests, &list);
  1744. spin_unlock(&cache->invalidation_lock);
  1745. list_for_each_entry_safe (req, tmp, &list, list)
  1746. process_invalidation_request(cache, req);
  1747. }
  1748. /*----------------------------------------------------------------
  1749. * Main worker loop
  1750. *--------------------------------------------------------------*/
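/*
 * Quiescing handshake: the suspend path raises the 'quiescing' flag and
 * then waits for the worker to acknowledge it, guaranteeing the worker
 * has observed the flag (and stopped picking up new bios) before the
 * outstanding migrations are drained.
 */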
  1751. static bool is_quiescing(struct cache *cache)
  1752. {
  1753. return atomic_read(&cache->quiescing);
  1754. }
  1755. static void ack_quiescing(struct cache *cache)
  1756. {
  1757. if (is_quiescing(cache)) {
  1758. atomic_inc(&cache->quiescing_ack);
  1759. wake_up(&cache->quiescing_wait);
  1760. }
  1761. }
  1762. static void wait_for_quiescing_ack(struct cache *cache)
  1763. {
  1764. wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack));
  1765. }
  1766. static void start_quiescing(struct cache *cache)
  1767. {
  1768. atomic_inc(&cache->quiescing);
  1769. wait_for_quiescing_ack(cache);
  1770. }
  1771. static void stop_quiescing(struct cache *cache)
  1772. {
  1773. atomic_set(&cache->quiescing, 0);
  1774. atomic_set(&cache->quiescing_ack, 0);
  1775. }
  1776. static void wait_for_migrations(struct cache *cache)
  1777. {
  1778. wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations));
  1779. }
  1780. static void stop_worker(struct cache *cache)
  1781. {
  1782. cancel_delayed_work(&cache->waker);
  1783. flush_workqueue(cache->wq);
  1784. }
  1785. static void requeue_deferred_cells(struct cache *cache)
  1786. {
  1787. unsigned long flags;
  1788. struct list_head cells;
  1789. struct dm_bio_prison_cell *cell, *tmp;
  1790. INIT_LIST_HEAD(&cells);
  1791. spin_lock_irqsave(&cache->lock, flags);
  1792. list_splice_init(&cache->deferred_cells, &cells);
  1793. spin_unlock_irqrestore(&cache->lock, flags);
  1794. list_for_each_entry_safe(cell, tmp, &cells, user_list)
  1795. cell_requeue(cache, cell);
  1796. }
  1797. static void requeue_deferred_bios(struct cache *cache)
  1798. {
  1799. struct bio *bio;
  1800. struct bio_list bios;
  1801. bio_list_init(&bios);
  1802. bio_list_merge(&bios, &cache->deferred_bios);
  1803. bio_list_init(&cache->deferred_bios);
  1804. while ((bio = bio_list_pop(&bios)))
  1805. bio_endio(bio, DM_ENDIO_REQUEUE);
  1806. }
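/*
 * Work remains while any of the worker's queues are non-empty.  During
 * quiescing only the migration lists are drained; deferred bios and
 * cells are left for the suspend path to requeue.
 */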
  1807. static int more_work(struct cache *cache)
  1808. {
  1809. if (is_quiescing(cache))
  1810. return !list_empty(&cache->quiesced_migrations) ||
  1811. !list_empty(&cache->completed_migrations) ||
  1812. !list_empty(&cache->need_commit_migrations);
  1813. else
  1814. return !bio_list_empty(&cache->deferred_bios) ||
  1815. !list_empty(&cache->deferred_cells) ||
  1816. !bio_list_empty(&cache->deferred_flush_bios) ||
  1817. !bio_list_empty(&cache->deferred_writethrough_bios) ||
  1818. !list_empty(&cache->quiesced_migrations) ||
  1819. !list_empty(&cache->completed_migrations) ||
  1820. !list_empty(&cache->need_commit_migrations) ||
  1821. cache->invalidate;
  1822. }
  1823. static void do_worker(struct work_struct *ws)
  1824. {
  1825. struct cache *cache = container_of(ws, struct cache, worker);
  1826. do {
  1827. if (!is_quiescing(cache)) {
  1828. writeback_some_dirty_blocks(cache);
  1829. process_deferred_writethrough_bios(cache);
  1830. process_deferred_bios(cache);
  1831. process_deferred_cells(cache);
  1832. process_invalidation_requests(cache);
  1833. }
  1834. process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard);
  1835. process_migrations(cache, &cache->completed_migrations, complete_migration);
  1836. if (commit_if_needed(cache)) {
  1837. process_deferred_flush_bios(cache, false);
  1838. process_migrations(cache, &cache->need_commit_migrations, migration_failure);
  1839. } else {
  1840. process_deferred_flush_bios(cache, true);
  1841. process_migrations(cache, &cache->need_commit_migrations,
  1842. migration_success_post_commit);
  1843. }
  1844. ack_quiescing(cache);
  1845. } while (more_work(cache));
  1846. }
  1847. /*
  1848. * We want to commit periodically so that not too much
  1849. * unwritten metadata builds up.
  1850. */
  1851. static void do_waker(struct work_struct *ws)
  1852. {
  1853. struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
  1854. policy_tick(cache->policy, true);
  1855. wake_worker(cache);
  1856. queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
  1857. }
  1858. /*----------------------------------------------------------------*/
  1859. static int is_congested(struct dm_dev *dev, int bdi_bits)
  1860. {
  1861. struct request_queue *q = bdev_get_queue(dev->bdev);
  1862. return bdi_congested(&q->backing_dev_info, bdi_bits);
  1863. }
  1864. static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
  1865. {
  1866. struct cache *cache = container_of(cb, struct cache, callbacks);
  1867. return is_congested(cache->origin_dev, bdi_bits) ||
  1868. is_congested(cache->cache_dev, bdi_bits);
  1869. }
  1870. /*----------------------------------------------------------------
  1871. * Target methods
  1872. *--------------------------------------------------------------*/
  1873. /*
  1874. * This function gets called on the error paths of the constructor, so we
  1875. * have to cope with a partially initialised struct.
  1876. */
  1877. static void destroy(struct cache *cache)
  1878. {
  1879. unsigned i;
  1880. if (cache->migration_pool)
  1881. mempool_destroy(cache->migration_pool);
  1882. if (cache->all_io_ds)
  1883. dm_deferred_set_destroy(cache->all_io_ds);
  1884. if (cache->prison)
  1885. dm_bio_prison_destroy(cache->prison);
  1886. if (cache->wq)
  1887. destroy_workqueue(cache->wq);
  1888. if (cache->dirty_bitset)
  1889. free_bitset(cache->dirty_bitset);
  1890. if (cache->discard_bitset)
  1891. free_bitset(cache->discard_bitset);
  1892. if (cache->copier)
  1893. dm_kcopyd_client_destroy(cache->copier);
  1894. if (cache->cmd)
  1895. dm_cache_metadata_close(cache->cmd);
  1896. if (cache->metadata_dev)
  1897. dm_put_device(cache->ti, cache->metadata_dev);
  1898. if (cache->origin_dev)
  1899. dm_put_device(cache->ti, cache->origin_dev);
  1900. if (cache->cache_dev)
  1901. dm_put_device(cache->ti, cache->cache_dev);
  1902. if (cache->policy)
  1903. dm_cache_policy_destroy(cache->policy);
  1904. for (i = 0; i < cache->nr_ctr_args ; i++)
  1905. kfree(cache->ctr_args[i]);
  1906. kfree(cache->ctr_args);
  1907. kfree(cache);
  1908. }
  1909. static void cache_dtr(struct dm_target *ti)
  1910. {
  1911. struct cache *cache = ti->private;
  1912. destroy(cache);
  1913. }
  1914. static sector_t get_dev_size(struct dm_dev *dev)
  1915. {
  1916. return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
  1917. }
  1918. /*----------------------------------------------------------------*/
  1919. /*
  1920. * Construct a cache device mapping.
  1921. *
  1922. * cache <metadata dev> <cache dev> <origin dev> <block size>
  1923. * <#feature args> [<feature arg>]*
  1924. * <policy> <#policy args> [<policy arg>]*
  1925. *
  1926. * metadata dev : fast device holding the persistent metadata
  1927. * cache dev : fast device holding cached data blocks
  1928. * origin dev : slow device holding original data blocks
  1929. * block size : cache unit size in sectors
  1930. *
  1931. * #feature args : number of feature arguments passed
  1932. * feature args : writethrough. (The default is writeback.)
  1933. *
  1934. * policy : the replacement policy to use
  1935. * #policy args : an even number of policy arguments corresponding
  1936. * to key/value pairs passed to the policy
  1937. * policy args : key/value pairs passed to the policy
  1938. * E.g. 'sequential_threshold 1024'
  1939. * See cache-policies.txt for details.
  1940. *
  1941. * Optional feature arguments are:
  1942. * writethrough : write through caching that prohibits cache block
  1943. * content from being different from origin block content.
  1944. * Without this argument, the default behaviour is to write
  1945. * back cache block contents later for performance reasons,
  1946. * so they may differ from the corresponding origin blocks.
  1947. */
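/*
 * For example, an illustrative table line (device paths hypothetical):
 *
 *   0 8388608 cache /dev/mapper/cache-meta /dev/mapper/cache-fast \
 *     /dev/mapper/cache-slow 512 1 writeback default 0
 *
 * i.e. a 4GiB mapped device using 256KiB (512 sector) cache blocks, the
 * writeback feature and the 'default' policy with no policy arguments.
 */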
  1948. struct cache_args {
  1949. struct dm_target *ti;
  1950. struct dm_dev *metadata_dev;
  1951. struct dm_dev *cache_dev;
  1952. sector_t cache_sectors;
  1953. struct dm_dev *origin_dev;
  1954. sector_t origin_sectors;
  1955. uint32_t block_size;
  1956. const char *policy_name;
  1957. int policy_argc;
  1958. const char **policy_argv;
  1959. struct cache_features features;
  1960. };
  1961. static void destroy_cache_args(struct cache_args *ca)
  1962. {
  1963. if (ca->metadata_dev)
  1964. dm_put_device(ca->ti, ca->metadata_dev);
  1965. if (ca->cache_dev)
  1966. dm_put_device(ca->ti, ca->cache_dev);
  1967. if (ca->origin_dev)
  1968. dm_put_device(ca->ti, ca->origin_dev);
  1969. kfree(ca);
  1970. }
  1971. static bool at_least_one_arg(struct dm_arg_set *as, char **error)
  1972. {
  1973. if (!as->argc) {
  1974. *error = "Insufficient args";
  1975. return false;
  1976. }
  1977. return true;
  1978. }
  1979. static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
  1980. char **error)
  1981. {
  1982. int r;
  1983. sector_t metadata_dev_size;
  1984. char b[BDEVNAME_SIZE];
  1985. if (!at_least_one_arg(as, error))
  1986. return -EINVAL;
  1987. r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
  1988. &ca->metadata_dev);
  1989. if (r) {
  1990. *error = "Error opening metadata device";
  1991. return r;
  1992. }
  1993. metadata_dev_size = get_dev_size(ca->metadata_dev);
  1994. if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
  1995. DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1996. bdevname(ca->metadata_dev->bdev, b), DM_CACHE_METADATA_MAX_SECTORS_WARNING);
  1997. return 0;
  1998. }
  1999. static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
  2000. char **error)
  2001. {
  2002. int r;
  2003. if (!at_least_one_arg(as, error))
  2004. return -EINVAL;
  2005. r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
  2006. &ca->cache_dev);
  2007. if (r) {
  2008. *error = "Error opening cache device";
  2009. return r;
  2010. }
  2011. ca->cache_sectors = get_dev_size(ca->cache_dev);
  2012. return 0;
  2013. }
  2014. static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
  2015. char **error)
  2016. {
  2017. int r;
  2018. if (!at_least_one_arg(as, error))
  2019. return -EINVAL;
  2020. r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
  2021. &ca->origin_dev);
  2022. if (r) {
  2023. *error = "Error opening origin device";
  2024. return r;
  2025. }
  2026. ca->origin_sectors = get_dev_size(ca->origin_dev);
  2027. if (ca->ti->len > ca->origin_sectors) {
  2028. *error = "Device size larger than cached device";
  2029. return -EINVAL;
  2030. }
  2031. return 0;
  2032. }
  2033. static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
  2034. char **error)
  2035. {
  2036. unsigned long block_size;
  2037. if (!at_least_one_arg(as, error))
  2038. return -EINVAL;
  2039. if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size ||
  2040. block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
  2041. block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
  2042. block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
  2043. *error = "Invalid data block size";
  2044. return -EINVAL;
  2045. }
  2046. if (block_size > ca->cache_sectors) {
  2047. *error = "Data block size is larger than the cache device";
  2048. return -EINVAL;
  2049. }
  2050. ca->block_size = block_size;
  2051. return 0;
  2052. }
  2053. static void init_features(struct cache_features *cf)
  2054. {
  2055. cf->mode = CM_WRITE;
  2056. cf->io_mode = CM_IO_WRITEBACK;
  2057. }
  2058. static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
  2059. char **error)
  2060. {
  2061. static struct dm_arg _args[] = {
  2062. {0, 1, "Invalid number of cache feature arguments"},
  2063. };
  2064. int r;
  2065. unsigned argc;
  2066. const char *arg;
  2067. struct cache_features *cf = &ca->features;
  2068. init_features(cf);
  2069. r = dm_read_arg_group(_args, as, &argc, error);
  2070. if (r)
  2071. return -EINVAL;
  2072. while (argc--) {
  2073. arg = dm_shift_arg(as);
  2074. if (!strcasecmp(arg, "writeback"))
  2075. cf->io_mode = CM_IO_WRITEBACK;
  2076. else if (!strcasecmp(arg, "writethrough"))
  2077. cf->io_mode = CM_IO_WRITETHROUGH;
  2078. else if (!strcasecmp(arg, "passthrough"))
  2079. cf->io_mode = CM_IO_PASSTHROUGH;
  2080. else {
  2081. *error = "Unrecognised cache feature requested";
  2082. return -EINVAL;
  2083. }
  2084. }
  2085. return 0;
  2086. }
  2087. static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
  2088. char **error)
  2089. {
  2090. static struct dm_arg _args[] = {
  2091. {0, 1024, "Invalid number of policy arguments"},
  2092. };
  2093. int r;
  2094. if (!at_least_one_arg(as, error))
  2095. return -EINVAL;
  2096. ca->policy_name = dm_shift_arg(as);
  2097. r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
  2098. if (r)
  2099. return -EINVAL;
  2100. ca->policy_argv = (const char **)as->argv;
  2101. dm_consume_args(as, ca->policy_argc);
  2102. return 0;
  2103. }
  2104. static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
  2105. char **error)
  2106. {
  2107. int r;
  2108. struct dm_arg_set as;
  2109. as.argc = argc;
  2110. as.argv = argv;
  2111. r = parse_metadata_dev(ca, &as, error);
  2112. if (r)
  2113. return r;
  2114. r = parse_cache_dev(ca, &as, error);
  2115. if (r)
  2116. return r;
  2117. r = parse_origin_dev(ca, &as, error);
  2118. if (r)
  2119. return r;
  2120. r = parse_block_size(ca, &as, error);
  2121. if (r)
  2122. return r;
  2123. r = parse_features(ca, &as, error);
  2124. if (r)
  2125. return r;
  2126. r = parse_policy(ca, &as, error);
  2127. if (r)
  2128. return r;
  2129. return 0;
  2130. }
  2131. /*----------------------------------------------------------------*/
  2132. static struct kmem_cache *migration_cache;
  2133. #define NOT_CORE_OPTION 1
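/*
 * Core config options are handled here; anything unrecognised is passed
 * on to the policy.  Values may also be changed at runtime via a target
 * message, e.g. (illustrative):
 *
 *   dmsetup message <cache-dev> 0 migration_threshold 204800
 */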
  2134. static int process_config_option(struct cache *cache, const char *key, const char *value)
  2135. {
  2136. unsigned long tmp;
  2137. if (!strcasecmp(key, "migration_threshold")) {
  2138. if (kstrtoul(value, 10, &tmp))
  2139. return -EINVAL;
  2140. cache->migration_threshold = tmp;
  2141. return 0;
  2142. }
  2143. return NOT_CORE_OPTION;
  2144. }
  2145. static int set_config_value(struct cache *cache, const char *key, const char *value)
  2146. {
  2147. int r = process_config_option(cache, key, value);
  2148. if (r == NOT_CORE_OPTION)
  2149. r = policy_set_config_value(cache->policy, key, value);
  2150. if (r)
  2151. DMWARN("bad config value for %s: %s", key, value);
  2152. return r;
  2153. }
  2154. static int set_config_values(struct cache *cache, int argc, const char **argv)
  2155. {
  2156. int r = 0;
  2157. if (argc & 1) {
  2158. DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
  2159. return -EINVAL;
  2160. }
  2161. while (argc) {
  2162. r = set_config_value(cache, argv[0], argv[1]);
  2163. if (r)
  2164. break;
  2165. argc -= 2;
  2166. argv += 2;
  2167. }
  2168. return r;
  2169. }
  2170. static int create_cache_policy(struct cache *cache, struct cache_args *ca,
  2171. char **error)
  2172. {
  2173. struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name,
  2174. cache->cache_size,
  2175. cache->origin_sectors,
  2176. cache->sectors_per_block);
  2177. if (IS_ERR(p)) {
  2178. *error = "Error creating cache's policy";
  2179. return PTR_ERR(p);
  2180. }
  2181. cache->policy = p;
  2182. return 0;
  2183. }
  2184. /*
2185. * We want the discard block size to be at least the cache block size and
2186. * to give no more than 2^14 discard blocks across the origin.
  2187. */
  2188. #define MAX_DISCARD_BLOCKS (1 << 14)
  2189. static bool too_many_discard_blocks(sector_t discard_block_size,
  2190. sector_t origin_size)
  2191. {
  2192. (void) sector_div(origin_size, discard_block_size);
  2193. return origin_size > MAX_DISCARD_BLOCKS;
  2194. }
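/*
 * e.g. with 256KiB (512 sector) cache blocks on a 1TiB origin, the
 * discard block size doubles from 512 sectors up to 2^17 sectors
 * (64MiB), at which point the origin is covered by exactly 2^14 blocks.
 */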
  2195. static sector_t calculate_discard_block_size(sector_t cache_block_size,
  2196. sector_t origin_size)
  2197. {
  2198. sector_t discard_block_size = cache_block_size;
  2199. if (origin_size)
  2200. while (too_many_discard_blocks(discard_block_size, origin_size))
  2201. discard_block_size *= 2;
  2202. return discard_block_size;
  2203. }
  2204. static void set_cache_size(struct cache *cache, dm_cblock_t size)
  2205. {
  2206. dm_block_t nr_blocks = from_cblock(size);
  2207. if (nr_blocks > (1 << 20) && cache->cache_size != size)
  2208. DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n"
  2209. "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n"
  2210. "Please consider increasing the cache block size to reduce the overall cache block count.",
  2211. (unsigned long long) nr_blocks);
  2212. cache->cache_size = size;
  2213. }
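/* In sectors: the default of 2048 sectors allows ~1MiB of data to be migrating at once. */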
  2214. #define DEFAULT_MIGRATION_THRESHOLD 2048
  2215. static int cache_create(struct cache_args *ca, struct cache **result)
  2216. {
  2217. int r = 0;
  2218. char **error = &ca->ti->error;
  2219. struct cache *cache;
  2220. struct dm_target *ti = ca->ti;
  2221. dm_block_t origin_blocks;
  2222. struct dm_cache_metadata *cmd;
  2223. bool may_format = ca->features.mode == CM_WRITE;
  2224. cache = kzalloc(sizeof(*cache), GFP_KERNEL);
  2225. if (!cache)
  2226. return -ENOMEM;
  2227. cache->ti = ca->ti;
  2228. ti->private = cache;
  2229. ti->num_flush_bios = 2;
  2230. ti->flush_supported = true;
  2231. ti->num_discard_bios = 1;
  2232. ti->discards_supported = true;
  2233. ti->discard_zeroes_data_unsupported = true;
  2234. ti->split_discard_bios = false;
  2235. cache->features = ca->features;
  2236. ti->per_bio_data_size = get_per_bio_data_size(cache);
  2237. cache->callbacks.congested_fn = cache_is_congested;
  2238. dm_table_add_target_callbacks(ti->table, &cache->callbacks);
  2239. cache->metadata_dev = ca->metadata_dev;
  2240. cache->origin_dev = ca->origin_dev;
  2241. cache->cache_dev = ca->cache_dev;
  2242. ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
  2243. /* FIXME: factor out this whole section */
  2244. origin_blocks = cache->origin_sectors = ca->origin_sectors;
  2245. origin_blocks = block_div(origin_blocks, ca->block_size);
  2246. cache->origin_blocks = to_oblock(origin_blocks);
  2247. cache->sectors_per_block = ca->block_size;
  2248. if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
  2249. r = -EINVAL;
  2250. goto bad;
  2251. }
  2252. if (ca->block_size & (ca->block_size - 1)) {
  2253. dm_block_t cache_size = ca->cache_sectors;
  2254. cache->sectors_per_block_shift = -1;
  2255. cache_size = block_div(cache_size, ca->block_size);
  2256. set_cache_size(cache, to_cblock(cache_size));
  2257. } else {
  2258. cache->sectors_per_block_shift = __ffs(ca->block_size);
  2259. set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift));
  2260. }
  2261. r = create_cache_policy(cache, ca, error);
  2262. if (r)
  2263. goto bad;
  2264. cache->policy_nr_args = ca->policy_argc;
  2265. cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
  2266. r = set_config_values(cache, ca->policy_argc, ca->policy_argv);
  2267. if (r) {
  2268. *error = "Error setting cache policy's config values";
  2269. goto bad;
  2270. }
  2271. cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
  2272. ca->block_size, may_format,
  2273. dm_cache_policy_get_hint_size(cache->policy));
  2274. if (IS_ERR(cmd)) {
  2275. *error = "Error creating metadata object";
  2276. r = PTR_ERR(cmd);
  2277. goto bad;
  2278. }
  2279. cache->cmd = cmd;
  2280. set_cache_mode(cache, CM_WRITE);
  2281. if (get_cache_mode(cache) != CM_WRITE) {
  2282. *error = "Unable to get write access to metadata, please check/repair metadata.";
  2283. r = -EINVAL;
  2284. goto bad;
  2285. }
  2286. if (passthrough_mode(&cache->features)) {
  2287. bool all_clean;
  2288. r = dm_cache_metadata_all_clean(cache->cmd, &all_clean);
  2289. if (r) {
  2290. *error = "dm_cache_metadata_all_clean() failed";
  2291. goto bad;
  2292. }
  2293. if (!all_clean) {
  2294. *error = "Cannot enter passthrough mode unless all blocks are clean";
  2295. r = -EINVAL;
  2296. goto bad;
  2297. }
  2298. }
  2299. spin_lock_init(&cache->lock);
  2300. INIT_LIST_HEAD(&cache->deferred_cells);
  2301. bio_list_init(&cache->deferred_bios);
  2302. bio_list_init(&cache->deferred_flush_bios);
  2303. bio_list_init(&cache->deferred_writethrough_bios);
  2304. INIT_LIST_HEAD(&cache->quiesced_migrations);
  2305. INIT_LIST_HEAD(&cache->completed_migrations);
  2306. INIT_LIST_HEAD(&cache->need_commit_migrations);
  2307. atomic_set(&cache->nr_allocated_migrations, 0);
  2308. atomic_set(&cache->nr_io_migrations, 0);
  2309. init_waitqueue_head(&cache->migration_wait);
  2310. init_waitqueue_head(&cache->quiescing_wait);
  2311. atomic_set(&cache->quiescing, 0);
  2312. atomic_set(&cache->quiescing_ack, 0);
  2313. r = -ENOMEM;
  2314. atomic_set(&cache->nr_dirty, 0);
  2315. cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
  2316. if (!cache->dirty_bitset) {
  2317. *error = "could not allocate dirty bitset";
  2318. goto bad;
  2319. }
  2320. clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
  2321. cache->discard_block_size =
  2322. calculate_discard_block_size(cache->sectors_per_block,
  2323. cache->origin_sectors);
  2324. cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors,
  2325. cache->discard_block_size));
  2326. cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
  2327. if (!cache->discard_bitset) {
  2328. *error = "could not allocate discard bitset";
  2329. goto bad;
  2330. }
  2331. clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
  2332. cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
  2333. if (IS_ERR(cache->copier)) {
  2334. *error = "could not create kcopyd client";
  2335. r = PTR_ERR(cache->copier);
  2336. goto bad;
  2337. }
  2338. cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
  2339. if (!cache->wq) {
  2340. *error = "could not create workqueue for metadata object";
  2341. goto bad;
  2342. }
  2343. INIT_WORK(&cache->worker, do_worker);
  2344. INIT_DELAYED_WORK(&cache->waker, do_waker);
  2345. cache->last_commit_jiffies = jiffies;
  2346. cache->prison = dm_bio_prison_create();
  2347. if (!cache->prison) {
  2348. *error = "could not create bio prison";
  2349. goto bad;
  2350. }
  2351. cache->all_io_ds = dm_deferred_set_create();
  2352. if (!cache->all_io_ds) {
  2353. *error = "could not create all_io deferred set";
  2354. goto bad;
  2355. }
  2356. cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
  2357. migration_cache);
  2358. if (!cache->migration_pool) {
  2359. *error = "Error creating cache's migration mempool";
  2360. goto bad;
  2361. }
  2362. cache->need_tick_bio = true;
  2363. cache->sized = false;
  2364. cache->invalidate = false;
  2365. cache->commit_requested = false;
  2366. cache->loaded_mappings = false;
  2367. cache->loaded_discards = false;
  2368. load_stats(cache);
  2369. atomic_set(&cache->stats.demotion, 0);
  2370. atomic_set(&cache->stats.promotion, 0);
  2371. atomic_set(&cache->stats.copies_avoided, 0);
  2372. atomic_set(&cache->stats.cache_cell_clash, 0);
  2373. atomic_set(&cache->stats.commit_count, 0);
  2374. atomic_set(&cache->stats.discard_count, 0);
  2375. spin_lock_init(&cache->invalidation_lock);
  2376. INIT_LIST_HEAD(&cache->invalidation_requests);
  2377. iot_init(&cache->origin_tracker);
  2378. *result = cache;
  2379. return 0;
  2380. bad:
  2381. destroy(cache);
  2382. return r;
  2383. }
  2384. static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
  2385. {
  2386. unsigned i;
  2387. const char **copy;
  2388. copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
  2389. if (!copy)
  2390. return -ENOMEM;
  2391. for (i = 0; i < argc; i++) {
  2392. copy[i] = kstrdup(argv[i], GFP_KERNEL);
  2393. if (!copy[i]) {
  2394. while (i--)
  2395. kfree(copy[i]);
  2396. kfree(copy);
  2397. return -ENOMEM;
  2398. }
  2399. }
  2400. cache->nr_ctr_args = argc;
  2401. cache->ctr_args = copy;
  2402. return 0;
  2403. }
  2404. static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
  2405. {
  2406. int r = -EINVAL;
  2407. struct cache_args *ca;
  2408. struct cache *cache = NULL;
  2409. ca = kzalloc(sizeof(*ca), GFP_KERNEL);
  2410. if (!ca) {
  2411. ti->error = "Error allocating memory for cache";
  2412. return -ENOMEM;
  2413. }
  2414. ca->ti = ti;
  2415. r = parse_cache_args(ca, argc, argv, &ti->error);
  2416. if (r)
  2417. goto out;
  2418. r = cache_create(ca, &cache);
  2419. if (r)
  2420. goto out;
  2421. r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
  2422. if (r) {
  2423. destroy(cache);
  2424. goto out;
  2425. }
  2426. ti->private = cache;
  2427. out:
  2428. destroy_cache_args(ca);
  2429. return r;
  2430. }
  2431. /*----------------------------------------------------------------*/
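/*
 * Fast-path mapping: no migrations are started here (can_migrate is
 * false).  Bios that would need a migration, or that hit a block whose
 * cell is already held, are handed over to the worker instead.
 */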
  2432. static int cache_map(struct dm_target *ti, struct bio *bio)
  2433. {
  2434. struct cache *cache = ti->private;
  2435. int r;
  2436. struct dm_bio_prison_cell *cell = NULL;
  2437. dm_oblock_t block = get_bio_block(cache, bio);
  2438. size_t pb_data_size = get_per_bio_data_size(cache);
  2439. bool can_migrate = false;
  2440. bool fast_promotion;
  2441. struct policy_result lookup_result;
  2442. struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
  2443. struct old_oblock_lock ool;
  2444. ool.locker.fn = null_locker;
  2445. if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
  2446. /*
  2447. * This can only occur if the io goes to a partial block at
  2448. * the end of the origin device. We don't cache these.
  2449. * Just remap to the origin and carry on.
  2450. */
  2451. remap_to_origin(cache, bio);
  2452. accounted_begin(cache, bio);
  2453. return DM_MAPIO_REMAPPED;
  2454. }
  2455. if (discard_or_flush(bio)) {
  2456. defer_bio(cache, bio);
  2457. return DM_MAPIO_SUBMITTED;
  2458. }
  2459. /*
  2460. * Check to see if that block is currently migrating.
  2461. */
  2462. cell = alloc_prison_cell(cache);
  2463. if (!cell) {
  2464. defer_bio(cache, bio);
  2465. return DM_MAPIO_SUBMITTED;
  2466. }
  2467. r = bio_detain(cache, block, bio, cell,
  2468. (cell_free_fn) free_prison_cell,
  2469. cache, &cell);
  2470. if (r) {
  2471. if (r < 0)
  2472. defer_bio(cache, bio);
  2473. return DM_MAPIO_SUBMITTED;
  2474. }
  2475. fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
  2476. r = policy_map(cache->policy, block, false, can_migrate, fast_promotion,
  2477. bio, &ool.locker, &lookup_result);
  2478. if (r == -EWOULDBLOCK) {
  2479. cell_defer(cache, cell, true);
  2480. return DM_MAPIO_SUBMITTED;
  2481. } else if (r) {
  2482. DMERR_LIMIT("%s: Unexpected return from cache replacement policy: %d",
  2483. cache_device_name(cache), r);
  2484. cell_defer(cache, cell, false);
  2485. bio_io_error(bio);
  2486. return DM_MAPIO_SUBMITTED;
  2487. }
  2488. r = DM_MAPIO_REMAPPED;
  2489. switch (lookup_result.op) {
  2490. case POLICY_HIT:
  2491. if (passthrough_mode(&cache->features)) {
  2492. if (bio_data_dir(bio) == WRITE) {
  2493. /*
  2494. * We need to invalidate this block, so
  2495. * defer for the worker thread.
  2496. */
  2497. cell_defer(cache, cell, true);
  2498. r = DM_MAPIO_SUBMITTED;
  2499. } else {
  2500. inc_miss_counter(cache, bio);
  2501. remap_to_origin_clear_discard(cache, bio, block);
  2502. accounted_begin(cache, bio);
  2503. inc_ds(cache, bio, cell);
  2504. // FIXME: we want to remap hits or misses straight
  2505. // away rather than passing over to the worker.
  2506. cell_defer(cache, cell, false);
  2507. }
  2508. } else {
  2509. inc_hit_counter(cache, bio);
  2510. if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
  2511. !is_dirty(cache, lookup_result.cblock)) {
  2512. remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
  2513. accounted_begin(cache, bio);
  2514. inc_ds(cache, bio, cell);
  2515. cell_defer(cache, cell, false);
  2516. } else
  2517. remap_cell_to_cache_dirty(cache, cell, block, lookup_result.cblock, false);
  2518. }
  2519. break;
  2520. case POLICY_MISS:
  2521. inc_miss_counter(cache, bio);
  2522. if (pb->req_nr != 0) {
  2523. /*
  2524. * This is a duplicate writethrough io that is no
  2525. * longer needed because the block has been demoted.
  2526. */
  2527. bio_endio(bio, 0);
  2528. // FIXME: remap everything as a miss
  2529. cell_defer(cache, cell, false);
  2530. r = DM_MAPIO_SUBMITTED;
  2531. } else
  2532. remap_cell_to_origin_clear_discard(cache, cell, block, false);
  2533. break;
  2534. default:
  2535. DMERR_LIMIT("%s: %s: erroring bio: unknown policy op: %u",
  2536. cache_device_name(cache), __func__,
  2537. (unsigned) lookup_result.op);
  2538. cell_defer(cache, cell, false);
  2539. bio_io_error(bio);
  2540. r = DM_MAPIO_SUBMITTED;
  2541. }
  2542. return r;
  2543. }
  2544. static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
  2545. {
  2546. struct cache *cache = ti->private;
  2547. unsigned long flags;
  2548. size_t pb_data_size = get_per_bio_data_size(cache);
  2549. struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
  2550. if (pb->tick) {
  2551. policy_tick(cache->policy, false);
  2552. spin_lock_irqsave(&cache->lock, flags);
  2553. cache->need_tick_bio = true;
  2554. spin_unlock_irqrestore(&cache->lock, flags);
  2555. }
  2556. check_for_quiesced_migrations(cache, pb);
  2557. accounted_complete(cache, bio);
  2558. return 0;
  2559. }
  2560. static int write_dirty_bitset(struct cache *cache)
  2561. {
  2562. unsigned i, r;
  2563. if (get_cache_mode(cache) >= CM_READ_ONLY)
  2564. return -EINVAL;
  2565. for (i = 0; i < from_cblock(cache->cache_size); i++) {
  2566. r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
  2567. is_dirty(cache, to_cblock(i)));
  2568. if (r) {
  2569. metadata_operation_failed(cache, "dm_cache_set_dirty", r);
  2570. return r;
  2571. }
  2572. }
  2573. return 0;
  2574. }
  2575. static int write_discard_bitset(struct cache *cache)
  2576. {
  2577. unsigned i, r;
  2578. if (get_cache_mode(cache) >= CM_READ_ONLY)
  2579. return -EINVAL;
  2580. r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
  2581. cache->discard_nr_blocks);
  2582. if (r) {
  2583. DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache));
  2584. metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r);
  2585. return r;
  2586. }
  2587. for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
  2588. r = dm_cache_set_discard(cache->cmd, to_dblock(i),
  2589. is_discarded(cache, to_dblock(i)));
  2590. if (r) {
  2591. metadata_operation_failed(cache, "dm_cache_set_discard", r);
  2592. return r;
  2593. }
  2594. }
  2595. return 0;
  2596. }
  2597. static int write_hints(struct cache *cache)
  2598. {
  2599. int r;
  2600. if (get_cache_mode(cache) >= CM_READ_ONLY)
  2601. return -EINVAL;
  2602. r = dm_cache_write_hints(cache->cmd, cache->policy);
  2603. if (r) {
  2604. metadata_operation_failed(cache, "dm_cache_write_hints", r);
  2605. return r;
  2606. }
  2607. return 0;
  2608. }
  2609. /*
  2610. * returns true on success
  2611. */
  2612. static bool sync_metadata(struct cache *cache)
  2613. {
  2614. int r1, r2, r3, r4;
  2615. r1 = write_dirty_bitset(cache);
  2616. if (r1)
  2617. DMERR("%s: could not write dirty bitset", cache_device_name(cache));
  2618. r2 = write_discard_bitset(cache);
  2619. if (r2)
  2620. DMERR("%s: could not write discard bitset", cache_device_name(cache));
  2621. save_stats(cache);
  2622. r3 = write_hints(cache);
  2623. if (r3)
  2624. DMERR("%s: could not write hints", cache_device_name(cache));
  2625. /*
  2626. * If writing the above metadata failed, we still commit, but don't
  2627. * set the clean shutdown flag. This will effectively force every
  2628. * dirty bit to be set on reload.
  2629. */
  2630. r4 = commit(cache, !r1 && !r2 && !r3);
  2631. if (r4)
  2632. DMERR("%s: could not write cache metadata", cache_device_name(cache));
  2633. return !r1 && !r2 && !r3 && !r4;
  2634. }
  2635. static void cache_postsuspend(struct dm_target *ti)
  2636. {
  2637. struct cache *cache = ti->private;
  2638. start_quiescing(cache);
  2639. wait_for_migrations(cache);
  2640. stop_worker(cache);
  2641. requeue_deferred_bios(cache);
  2642. requeue_deferred_cells(cache);
  2643. stop_quiescing(cache);
  2644. if (get_cache_mode(cache) == CM_WRITE)
  2645. (void) sync_metadata(cache);
  2646. }
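/*
 * Callback for dm_cache_load_mappings(): feed each mapping from the
 * metadata into the policy and restore its dirty state.
 */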
  2647. static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
  2648. bool dirty, uint32_t hint, bool hint_valid)
  2649. {
  2650. int r;
  2651. struct cache *cache = context;
  2652. r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
  2653. if (r)
  2654. return r;
  2655. if (dirty)
  2656. set_dirty(cache, oblock, cblock);
  2657. else
  2658. clear_dirty(cache, oblock, cblock);
  2659. return 0;
  2660. }
  2661. /*
  2662. * The discard block size in the on disk metadata is not
2663. * necessarily the same as we're currently using. So we have to
  2664. * be careful to only set the discarded attribute if we know it
  2665. * covers a complete block of the new size.
  2666. */
  2667. struct discard_load_info {
  2668. struct cache *cache;
  2669. /*
  2670. * These blocks are sized using the on disk dblock size, rather
  2671. * than the current one.
  2672. */
  2673. dm_block_t block_size;
  2674. dm_block_t discard_begin, discard_end;
  2675. };
  2676. static void discard_load_info_init(struct cache *cache,
  2677. struct discard_load_info *li)
  2678. {
  2679. li->cache = cache;
  2680. li->discard_begin = li->discard_end = 0;
  2681. }
static void set_discard_range(struct discard_load_info *li)
{
	sector_t b, e;

	if (li->discard_begin == li->discard_end)
		return;

	/*
	 * Convert to sectors.
	 */
	b = li->discard_begin * li->block_size;
	e = li->discard_end * li->block_size;

	/*
	 * Then convert back to the current dblock size.
	 */
	b = dm_sector_div_up(b, li->cache->discard_block_size);
	sector_div(e, li->cache->discard_block_size);

	/*
	 * The origin may have shrunk, so we need to check we're still in
	 * bounds.
	 */
	if (e > from_dblock(li->cache->discard_nr_blocks))
		e = from_dblock(li->cache->discard_nr_blocks);

	for (; b < e; b++)
		set_discard(li->cache, to_dblock(b));
}

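/*
 * Callback handed to dm_cache_load_discards(): coalesces consecutive
 * discarded on-disk dblocks into a single range and emits it via
 * set_discard_range() whenever the run is broken.
 */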
static int load_discard(void *context, sector_t discard_block_size,
			dm_dblock_t dblock, bool discard)
{
	struct discard_load_info *li = context;

	li->block_size = discard_block_size;

	if (discard) {
		if (from_dblock(dblock) == li->discard_end)
			/*
			 * We're already in a discard range, just extend it.
			 */
			li->discard_end = li->discard_end + 1ULL;

		else {
			/*
			 * Emit the old range and start a new one.
			 */
			set_discard_range(li);
			li->discard_begin = from_dblock(dblock);
			li->discard_end = li->discard_begin + 1ULL;
		}
	} else {
		set_discard_range(li);
		li->discard_begin = li->discard_end = 0;
	}

	return 0;
}

static dm_cblock_t get_cache_dev_size(struct cache *cache)
{
	sector_t size = get_dev_size(cache->cache_dev);
	(void) sector_div(size, cache->sectors_per_block);
	return to_cblock(size);
}

static bool can_resize(struct cache *cache, dm_cblock_t new_size)
{
	if (from_cblock(new_size) > from_cblock(cache->cache_size))
		return true;

	/*
	 * We can't drop a dirty block when shrinking the cache.
	 */
	while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
		new_size = to_cblock(from_cblock(new_size) + 1);
		if (is_dirty(cache, new_size)) {
			DMERR("%s: unable to shrink cache; cache block %llu is dirty",
			      cache_device_name(cache),
			      (unsigned long long) from_cblock(new_size));
			return false;
		}
	}

	return true;
}

static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
{
	int r;

	r = dm_cache_resize(cache->cmd, new_size);
	if (r) {
		DMERR("%s: could not resize cache metadata", cache_device_name(cache));
		metadata_operation_failed(cache, "dm_cache_resize", r);
		return r;
	}

	set_cache_size(cache, new_size);

	return 0;
}

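/*
 * Pre-resume runs in three stages: (1) size (or resize) the cache metadata
 * to match the current cache device, (2) replay the on-disk mappings into
 * the policy, (3) rebuild the discard bitset from the on-disk discard
 * records.  Stages (2) and (3) only happen on the first resume.
 */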
static int cache_preresume(struct dm_target *ti)
{
	int r = 0;
	struct cache *cache = ti->private;
	dm_cblock_t csize = get_cache_dev_size(cache);

	/*
	 * Check to see if the cache has resized.
	 */
	if (!cache->sized) {
		r = resize_cache_dev(cache, csize);
		if (r)
			return r;

		cache->sized = true;

	} else if (csize != cache->cache_size) {
		if (!can_resize(cache, csize))
			return -EINVAL;

		r = resize_cache_dev(cache, csize);
		if (r)
			return r;
	}

	if (!cache->loaded_mappings) {
		r = dm_cache_load_mappings(cache->cmd, cache->policy,
					   load_mapping, cache);
		if (r) {
			DMERR("%s: could not load cache mappings", cache_device_name(cache));
			metadata_operation_failed(cache, "dm_cache_load_mappings", r);
			return r;
		}

		cache->loaded_mappings = true;
	}

	if (!cache->loaded_discards) {
		struct discard_load_info li;

		/*
		 * The discard bitset could have been resized, or the
		 * discard block size changed.  To be safe we start by
		 * setting every dblock to not discarded.
		 */
		clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));

		discard_load_info_init(cache, &li);
		r = dm_cache_load_discards(cache->cmd, load_discard, &li);
		if (r) {
			DMERR("%s: could not load origin discards", cache_device_name(cache));
			metadata_operation_failed(cache, "dm_cache_load_discards", r);
			return r;
		}
		set_discard_range(&li);

		cache->loaded_discards = true;
	}

	return r;
}

static void cache_resume(struct dm_target *ti)
{
	struct cache *cache = ti->private;

	cache->need_tick_bio = true;
	do_waker(&cache->waker.work);
}

/*
 * Status format:
 *
 * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
 * <cache block size> <#used cache blocks>/<#total cache blocks>
 * <#read hits> <#read misses> <#write hits> <#write misses>
 * <#demotions> <#promotions> <#dirty>
 * <#features> <features>*
 * <#core args> <core args>
 * <policy name> <#policy args> <policy args>* <cache metadata mode>
 */
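/*
 * For illustration only (the numbers here are invented), a STATUSTYPE_INFO
 * line in the above format might look something like:
 *
 *   8 72/8192 128 511/4096 139 352 15 46 12 523 3 1 writeback 2 migration_threshold 2048 smq 0 rw
 *
 * i.e. 8-sector metadata blocks with 72 of 8192 used, 128-sector cache
 * blocks with 511 of 4096 resident, the hit/miss/demotion/promotion/dirty
 * counters, one feature arg, the two core args, the policy name and its
 * args, and finally the metadata mode.
 */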
static void cache_status(struct dm_target *ti, status_type_t type,
			 unsigned status_flags, char *result, unsigned maxlen)
{
	int r = 0;
	unsigned i;
	ssize_t sz = 0;
	dm_block_t nr_free_blocks_metadata = 0;
	dm_block_t nr_blocks_metadata = 0;
	char buf[BDEVNAME_SIZE];
	struct cache *cache = ti->private;
	dm_cblock_t residency;

	switch (type) {
	case STATUSTYPE_INFO:
		if (get_cache_mode(cache) == CM_FAIL) {
			DMEMIT("Fail");
			break;
		}

		/* Commit to ensure statistics aren't out-of-date */
		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
			(void) commit(cache, false);

		r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata);
		if (r) {
			DMERR("%s: dm_cache_get_free_metadata_block_count returned %d",
			      cache_device_name(cache), r);
			goto err;
		}

		r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
		if (r) {
			DMERR("%s: dm_cache_get_metadata_dev_size returned %d",
			      cache_device_name(cache), r);
			goto err;
		}

		residency = policy_residency(cache->policy);

		DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ",
		       (unsigned)DM_CACHE_METADATA_BLOCK_SIZE,
		       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
		       (unsigned long long)nr_blocks_metadata,
		       cache->sectors_per_block,
		       (unsigned long long) from_cblock(residency),
		       (unsigned long long) from_cblock(cache->cache_size),
		       (unsigned) atomic_read(&cache->stats.read_hit),
		       (unsigned) atomic_read(&cache->stats.read_miss),
		       (unsigned) atomic_read(&cache->stats.write_hit),
		       (unsigned) atomic_read(&cache->stats.write_miss),
		       (unsigned) atomic_read(&cache->stats.demotion),
		       (unsigned) atomic_read(&cache->stats.promotion),
		       (unsigned long) atomic_read(&cache->nr_dirty));

		if (writethrough_mode(&cache->features))
			DMEMIT("1 writethrough ");

		else if (passthrough_mode(&cache->features))
			DMEMIT("1 passthrough ");

		else if (writeback_mode(&cache->features))
			DMEMIT("1 writeback ");

		else {
			DMERR("%s: internal error: unknown io mode: %d",
			      cache_device_name(cache), (int) cache->features.io_mode);
			goto err;
		}

		DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);

		DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
		if (sz < maxlen) {
			r = policy_emit_config_values(cache->policy, result, maxlen, &sz);
			if (r)
				DMERR("%s: policy_emit_config_values returned %d",
				      cache_device_name(cache), r);
		}

		if (get_cache_mode(cache) == CM_READ_ONLY)
			DMEMIT("ro ");
		else
			DMEMIT("rw ");

		break;

	case STATUSTYPE_TABLE:
		format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
		DMEMIT("%s ", buf);
		format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
		DMEMIT("%s ", buf);
		format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
		DMEMIT("%s", buf);

		for (i = 0; i < cache->nr_ctr_args - 1; i++)
			DMEMIT(" %s", cache->ctr_args[i]);
		if (cache->nr_ctr_args)
			DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
	}

	return;

err:
	DMEMIT("Error");
}

/*
 * A cache block range can take two forms:
 *
 * i) A single cblock, eg. '3456'
 * ii) A begin and end cblock with a dash between, eg. 123-234
 */
static int parse_cblock_range(struct cache *cache, const char *str,
			      struct cblock_range *result)
{
	char dummy;
	uint64_t b, e;
	int r;

	/*
	 * Try and parse form (ii) first.
	 */
	r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy);
	if (r < 0)
		return r;

	if (r == 2) {
		result->begin = to_cblock(b);
		result->end = to_cblock(e);
		return 0;
	}

	/*
	 * That didn't work, try form (i).
	 */
	r = sscanf(str, "%llu%c", &b, &dummy);
	if (r < 0)
		return r;

	if (r == 1) {
		result->begin = to_cblock(b);
		result->end = to_cblock(from_cblock(result->begin) + 1u);
		return 0;
	}

	DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str);
	return -EINVAL;
}

static int validate_cblock_range(struct cache *cache, struct cblock_range *range)
{
	uint64_t b = from_cblock(range->begin);
	uint64_t e = from_cblock(range->end);
	uint64_t n = from_cblock(cache->cache_size);

	if (b >= n) {
		DMERR("%s: begin cblock out of range: %llu >= %llu",
		      cache_device_name(cache), b, n);
		return -EINVAL;
	}

	if (e > n) {
		DMERR("%s: end cblock out of range: %llu > %llu",
		      cache_device_name(cache), e, n);
		return -EINVAL;
	}

	if (b >= e) {
		DMERR("%s: invalid cblock range: %llu >= %llu",
		      cache_device_name(cache), b, e);
		return -EINVAL;
	}

	return 0;
}

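/*
 * Queue a single invalidation request for the worker thread and block until
 * the worker has marked it complete.  The request lives on the caller's
 * stack, which is safe because we wait for completion before returning.
 */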
static int request_invalidation(struct cache *cache, struct cblock_range *range)
{
	struct invalidation_request req;

	INIT_LIST_HEAD(&req.list);
	req.cblocks = range;
	atomic_set(&req.complete, 0);
	req.err = 0;
	init_waitqueue_head(&req.result_wait);

	spin_lock(&cache->invalidation_lock);
	list_add(&req.list, &cache->invalidation_requests);
	spin_unlock(&cache->invalidation_lock);
	wake_worker(cache);

	wait_event(req.result_wait, atomic_read(&req.complete));
	return req.err;
}

static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
					      const char **cblock_ranges)
{
	int r = 0;
	unsigned i;
	struct cblock_range range;

	if (!passthrough_mode(&cache->features)) {
		DMERR("%s: cache has to be in passthrough mode for invalidation",
		      cache_device_name(cache));
		return -EPERM;
	}

	for (i = 0; i < count; i++) {
		r = parse_cblock_range(cache, cblock_ranges[i], &range);
		if (r)
			break;

		r = validate_cblock_range(cache, &range);
		if (r)
			break;

		/*
		 * Pass the begin and end cache blocks to the worker and wake it.
		 */
		r = request_invalidation(cache, &range);
		if (r)
			break;
	}

	return r;
}

/*
 * Supports
 *	"<key> <value>"
 * and
 *	"invalidate_cblocks [(<begin>)|(<begin>-<end>)]*"
 *
 * The key migration_threshold is supported by the cache target core.
 */
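/*
 * For example (the device name is purely illustrative), cache blocks can be
 * invalidated from userspace with something like:
 *
 *   dmsetup message my_cache 0 invalidate_cblocks 2345 3456-4567
 *
 * and a core tunable changed with:
 *
 *   dmsetup message my_cache 0 migration_threshold 204800
 */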
static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
{
	struct cache *cache = ti->private;

	if (!argc)
		return -EINVAL;

	if (get_cache_mode(cache) >= CM_READ_ONLY) {
		DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode",
		      cache_device_name(cache));
		return -EOPNOTSUPP;
	}

	if (!strcasecmp(argv[0], "invalidate_cblocks"))
		return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);

	if (argc != 2)
		return -EINVAL;

	return set_config_value(cache, argv[0], argv[1]);
}

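/*
 * Report the underlying devices (cache and origin) to the device-mapper
 * core, e.g. so stacked queue limits can be computed across both.
 */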
static int cache_iterate_devices(struct dm_target *ti,
				 iterate_devices_callout_fn fn, void *data)
{
	int r = 0;
	struct cache *cache = ti->private;

	r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
	if (!r)
		r = fn(ti, cache->origin_dev, 0, ti->len, data);

	return r;
}

/*
 * We assume I/O is going to the origin (which is the volume
 * more likely to have restrictions e.g. by being striped).
 * (Looking up the exact location of the data would be expensive
 * and could always be out of date by the time the bio is submitted.)
 */
static int cache_bvec_merge(struct dm_target *ti,
			    struct bvec_merge_data *bvm,
			    struct bio_vec *biovec, int max_size)
{
	struct cache *cache = ti->private;
	struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev);

	if (!q->merge_bvec_fn)
		return max_size;

	bvm->bi_bdev = cache->origin_dev->bdev;
	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
}

static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
{
	/*
	 * FIXME: these limits may be incompatible with the cache device
	 */
	limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024,
					    cache->origin_sectors);
	limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
}

static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct cache *cache = ti->private;
	uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;

	/*
	 * If the system-determined stacked limits are compatible with the
	 * cache's blocksize (io_opt is a factor) do not override them.
	 */
	if (io_opt_sectors < cache->sectors_per_block ||
	    do_div(io_opt_sectors, cache->sectors_per_block)) {
		blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT);
		blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
	}
	set_discard_limits(cache, limits);
}

/*----------------------------------------------------------------*/

static struct target_type cache_target = {
	.name = "cache",
	.version = {1, 7, 0},
	.module = THIS_MODULE,
	.ctr = cache_ctr,
	.dtr = cache_dtr,
	.map = cache_map,
	.end_io = cache_end_io,
	.postsuspend = cache_postsuspend,
	.preresume = cache_preresume,
	.resume = cache_resume,
	.status = cache_status,
	.message = cache_message,
	.iterate_devices = cache_iterate_devices,
	.merge = cache_bvec_merge,
	.io_hints = cache_io_hints,
};

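/*
 * Module init: register the target first, then create the migration slab
 * cache; if the slab allocation fails, the target is unregistered again so
 * the module loads all-or-nothing.
 */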
static int __init dm_cache_init(void)
{
	int r;

	r = dm_register_target(&cache_target);
	if (r) {
		DMERR("cache target registration failed: %d", r);
		return r;
	}

	migration_cache = KMEM_CACHE(dm_cache_migration, 0);
	if (!migration_cache) {
		dm_unregister_target(&cache_target);
		return -ENOMEM;
	}

	return 0;
}

static void __exit dm_cache_exit(void)
{
	dm_unregister_target(&cache_target);
	kmem_cache_destroy(migration_cache);
}

module_init(dm_cache_init);
module_exit(dm_cache_exit);

MODULE_DESCRIPTION(DM_NAME " cache target");
MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
MODULE_LICENSE("GPL");