// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/wait.h>
#include <linux/writeback.h>

#include "super.h"
#include "mds_client.h"
#include "cache.h"
#include <linux/ceph/decode.h>
#include <linux/ceph/messenger.h>

/*
 * Capability management
 *
 * The Ceph metadata servers control client access to inode metadata
 * and file data by issuing capabilities, granting clients permission
 * to read and/or write both inode fields and file data to OSDs
 * (storage nodes).  Each capability consists of a set of bits
 * indicating which operations are allowed.
 *
 * If the client holds a *_SHARED cap, the client has a coherent value
 * that can be safely read from the cached inode.
 *
 * In the case of a *_EXCL (exclusive) or FILE_WR capability, the
 * client is allowed to change inode attributes (e.g., file size,
 * mtime), note its dirty state in the ceph_cap, and asynchronously
 * flush that metadata change to the MDS.
 *
 * In the event of a conflicting operation (perhaps by another
 * client), the MDS will revoke the conflicting client capabilities.
 *
 * In order for a client to cache an inode, it must hold a capability
 * from at least one MDS.  When inodes are released, release
 * notifications are batched and periodically sent en masse to the MDS
 * cluster to release server state.
 */
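
/*
 * A note on cap encoding (informal, inferred from the decoder below):
 * a cap word packs the same generic bits (SHARED, EXCL, CACHE, RD, WR,
 * BUFFER, ...) into per-subsystem ranges -- AUTH, LINK, XATTR, FILE --
 * at the CEPH_CAP_SAUTH/SLINK/SXATTR/SFILE shifts from ceph_fs.h, plus
 * a standalone PIN bit.  CEPH_CAP_FILE_WR, for example, is the generic
 * WR bit shifted into the FILE range.  ceph_cap_string() below simply
 * walks those ranges to build strings like "pAsxFsxwb".
 */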
static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc);
static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
				 struct ceph_mds_session *session,
				 struct ceph_inode_info *ci,
				 u64 oldest_flush_tid);

/*
 * Generate readable cap strings for debugging output.
 */
#define MAX_CAP_STR 20
static char cap_str[MAX_CAP_STR][40];
static DEFINE_SPINLOCK(cap_str_lock);
static int last_cap_str;

static char *gcap_string(char *s, int c)
{
	if (c & CEPH_CAP_GSHARED)
		*s++ = 's';
	if (c & CEPH_CAP_GEXCL)
		*s++ = 'x';
	if (c & CEPH_CAP_GCACHE)
		*s++ = 'c';
	if (c & CEPH_CAP_GRD)
		*s++ = 'r';
	if (c & CEPH_CAP_GWR)
		*s++ = 'w';
	if (c & CEPH_CAP_GBUFFER)
		*s++ = 'b';
	if (c & CEPH_CAP_GWREXTEND)
		*s++ = 'a';
	if (c & CEPH_CAP_GLAZYIO)
		*s++ = 'l';
	return s;
}

const char *ceph_cap_string(int caps)
{
	int i;
	char *s;
	int c;

	spin_lock(&cap_str_lock);
	i = last_cap_str++;
	if (last_cap_str == MAX_CAP_STR)
		last_cap_str = 0;
	spin_unlock(&cap_str_lock);

	s = cap_str[i];

	if (caps & CEPH_CAP_PIN)
		*s++ = 'p';

	c = (caps >> CEPH_CAP_SAUTH) & 3;
	if (c) {
		*s++ = 'A';
		s = gcap_string(s, c);
	}

	c = (caps >> CEPH_CAP_SLINK) & 3;
	if (c) {
		*s++ = 'L';
		s = gcap_string(s, c);
	}

	c = (caps >> CEPH_CAP_SXATTR) & 3;
	if (c) {
		*s++ = 'X';
		s = gcap_string(s, c);
	}

	c = caps >> CEPH_CAP_SFILE;
	if (c) {
		*s++ = 'F';
		s = gcap_string(s, c);
	}

	if (s == cap_str[i])
		*s++ = '-';
	*s = 0;
	return cap_str[i];
}
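
/*
 * Examples (illustrative): ceph_cap_string(CEPH_CAP_FILE_SHARED |
 * CEPH_CAP_FILE_RD) renders as "Fsr", and a caps value of 0 renders
 * as "-".  The returned pointer refers to a small rotating buffer,
 * so it is only stable for the next MAX_CAP_STR - 1 calls.
 */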
void ceph_caps_init(struct ceph_mds_client *mdsc)
{
	INIT_LIST_HEAD(&mdsc->caps_list);
	spin_lock_init(&mdsc->caps_list_lock);
}

void ceph_caps_finalize(struct ceph_mds_client *mdsc)
{
	struct ceph_cap *cap;

	spin_lock(&mdsc->caps_list_lock);
	while (!list_empty(&mdsc->caps_list)) {
		cap = list_first_entry(&mdsc->caps_list,
				       struct ceph_cap, caps_item);
		list_del(&cap->caps_item);
		kmem_cache_free(ceph_cap_cachep, cap);
	}
	mdsc->caps_total_count = 0;
	mdsc->caps_avail_count = 0;
	mdsc->caps_use_count = 0;
	mdsc->caps_reserve_count = 0;
	mdsc->caps_min_count = 0;
	spin_unlock(&mdsc->caps_list_lock);
}

void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
{
	spin_lock(&mdsc->caps_list_lock);
	mdsc->caps_min_count += delta;
	BUG_ON(mdsc->caps_min_count < 0);
	spin_unlock(&mdsc->caps_list_lock);
}

/*
 * Called under mdsc->mutex.
 */
int ceph_reserve_caps(struct ceph_mds_client *mdsc,
		      struct ceph_cap_reservation *ctx, int need)
{
	int i, j;
	struct ceph_cap *cap;
	int have;
	int alloc = 0;
	int max_caps;
	bool trimmed = false;
	struct ceph_mds_session *s;
	LIST_HEAD(newcaps);

	dout("reserve caps ctx=%p need=%d\n", ctx, need);

	/* first reserve any caps that are already allocated */
	spin_lock(&mdsc->caps_list_lock);
	if (mdsc->caps_avail_count >= need)
		have = need;
	else
		have = mdsc->caps_avail_count;
	mdsc->caps_avail_count -= have;
	mdsc->caps_reserve_count += have;
	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
					 mdsc->caps_reserve_count +
					 mdsc->caps_avail_count);
	spin_unlock(&mdsc->caps_list_lock);

	for (i = have; i < need; ) {
		cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
		if (cap) {
			list_add(&cap->caps_item, &newcaps);
			alloc++;
			i++;
			continue;
		}

		if (!trimmed) {
			for (j = 0; j < mdsc->max_sessions; j++) {
				s = __ceph_lookup_mds_session(mdsc, j);
				if (!s)
					continue;
				mutex_unlock(&mdsc->mutex);

				mutex_lock(&s->s_mutex);
				max_caps = s->s_nr_caps - (need - i);
				ceph_trim_caps(mdsc, s, max_caps);
				mutex_unlock(&s->s_mutex);

				ceph_put_mds_session(s);
				mutex_lock(&mdsc->mutex);
			}
			trimmed = true;

			spin_lock(&mdsc->caps_list_lock);
			if (mdsc->caps_avail_count) {
				int more_have;
				if (mdsc->caps_avail_count >= need - i)
					more_have = need - i;
				else
					more_have = mdsc->caps_avail_count;

				i += more_have;
				have += more_have;
				mdsc->caps_avail_count -= more_have;
				mdsc->caps_reserve_count += more_have;
			}
			spin_unlock(&mdsc->caps_list_lock);

			continue;
		}

		pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
			ctx, need, have + alloc);
		goto out_nomem;
	}
	BUG_ON(have + alloc != need);

	spin_lock(&mdsc->caps_list_lock);
	mdsc->caps_total_count += alloc;
	mdsc->caps_reserve_count += alloc;
	list_splice(&newcaps, &mdsc->caps_list);

	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
					 mdsc->caps_reserve_count +
					 mdsc->caps_avail_count);
	spin_unlock(&mdsc->caps_list_lock);

	ctx->count = need;
	dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
	     ctx, mdsc->caps_total_count, mdsc->caps_use_count,
	     mdsc->caps_reserve_count, mdsc->caps_avail_count);
	return 0;

out_nomem:
	spin_lock(&mdsc->caps_list_lock);
	mdsc->caps_avail_count += have;
	mdsc->caps_reserve_count -= have;

	while (!list_empty(&newcaps)) {
		cap = list_first_entry(&newcaps,
				       struct ceph_cap, caps_item);
		list_del(&cap->caps_item);

		/* Keep some preallocated caps around (ceph_min_count), to
		 * avoid lots of free/alloc churn. */
		if (mdsc->caps_avail_count >=
		    mdsc->caps_reserve_count + mdsc->caps_min_count) {
			kmem_cache_free(ceph_cap_cachep, cap);
		} else {
			mdsc->caps_avail_count++;
			mdsc->caps_total_count++;
			list_add(&cap->caps_item, &mdsc->caps_list);
		}
	}

	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
					 mdsc->caps_reserve_count +
					 mdsc->caps_avail_count);
	spin_unlock(&mdsc->caps_list_lock);
	return -ENOMEM;
}

int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
			struct ceph_cap_reservation *ctx)
{
	int i;
	struct ceph_cap *cap;

	dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
	if (ctx->count) {
		spin_lock(&mdsc->caps_list_lock);
		BUG_ON(mdsc->caps_reserve_count < ctx->count);
		mdsc->caps_reserve_count -= ctx->count;
		if (mdsc->caps_avail_count >=
		    mdsc->caps_reserve_count + mdsc->caps_min_count) {
			mdsc->caps_total_count -= ctx->count;
			for (i = 0; i < ctx->count; i++) {
				cap = list_first_entry(&mdsc->caps_list,
					struct ceph_cap, caps_item);
				list_del(&cap->caps_item);
				kmem_cache_free(ceph_cap_cachep, cap);
			}
		} else {
			mdsc->caps_avail_count += ctx->count;
		}
		ctx->count = 0;
		dout("unreserve caps %d = %d used + %d resv + %d avail\n",
		     mdsc->caps_total_count, mdsc->caps_use_count,
		     mdsc->caps_reserve_count, mdsc->caps_avail_count);
		BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
						 mdsc->caps_reserve_count +
						 mdsc->caps_avail_count);
		spin_unlock(&mdsc->caps_list_lock);
	}
	return 0;
}
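
/*
 * Typical reservation lifecycle (a sketch, not lifted from a specific
 * caller): reserving up front means the later ceph_get_cap() calls
 * cannot fail at the point where a cap is attached to an inode.
 *
 *	struct ceph_cap_reservation rsv = {};
 *
 *	err = ceph_reserve_caps(mdsc, &rsv, need);   // under mdsc->mutex
 *	if (err)
 *		return err;
 *	cap = ceph_get_cap(mdsc, &rsv);              // cannot fail
 *	...
 *	ceph_unreserve_caps(mdsc, &rsv);             // return leftovers
 */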
struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
			      struct ceph_cap_reservation *ctx)
{
	struct ceph_cap *cap = NULL;

	/* temporary, until we do something about cap import/export */
	if (!ctx) {
		cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
		if (cap) {
			spin_lock(&mdsc->caps_list_lock);
			mdsc->caps_use_count++;
			mdsc->caps_total_count++;
			spin_unlock(&mdsc->caps_list_lock);
		} else {
			spin_lock(&mdsc->caps_list_lock);
			if (mdsc->caps_avail_count) {
				BUG_ON(list_empty(&mdsc->caps_list));

				mdsc->caps_avail_count--;
				mdsc->caps_use_count++;
				cap = list_first_entry(&mdsc->caps_list,
						struct ceph_cap, caps_item);
				list_del(&cap->caps_item);

				BUG_ON(mdsc->caps_total_count !=
				       mdsc->caps_use_count +
				       mdsc->caps_reserve_count +
				       mdsc->caps_avail_count);
			}
			spin_unlock(&mdsc->caps_list_lock);
		}
		return cap;
	}

	spin_lock(&mdsc->caps_list_lock);
	dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
	     ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count,
	     mdsc->caps_reserve_count, mdsc->caps_avail_count);
	BUG_ON(!ctx->count);
	BUG_ON(ctx->count > mdsc->caps_reserve_count);
	BUG_ON(list_empty(&mdsc->caps_list));

	ctx->count--;
	mdsc->caps_reserve_count--;
	mdsc->caps_use_count++;

	cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item);
	list_del(&cap->caps_item);

	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
	       mdsc->caps_reserve_count + mdsc->caps_avail_count);
	spin_unlock(&mdsc->caps_list_lock);
	return cap;
}

void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
{
	spin_lock(&mdsc->caps_list_lock);
	dout("put_cap %p %d = %d used + %d resv + %d avail\n",
	     cap, mdsc->caps_total_count, mdsc->caps_use_count,
	     mdsc->caps_reserve_count, mdsc->caps_avail_count);
	mdsc->caps_use_count--;
	/*
	 * Keep some preallocated caps around (ceph_min_count), to
	 * avoid lots of free/alloc churn.
	 */
	if (mdsc->caps_avail_count >= mdsc->caps_reserve_count +
				      mdsc->caps_min_count) {
		mdsc->caps_total_count--;
		kmem_cache_free(ceph_cap_cachep, cap);
	} else {
		mdsc->caps_avail_count++;
		list_add(&cap->caps_item, &mdsc->caps_list);
	}

	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
	       mdsc->caps_reserve_count + mdsc->caps_avail_count);
	spin_unlock(&mdsc->caps_list_lock);
}

void ceph_reservation_status(struct ceph_fs_client *fsc,
			     int *total, int *avail, int *used, int *reserved,
			     int *min)
{
	struct ceph_mds_client *mdsc = fsc->mdsc;

	spin_lock(&mdsc->caps_list_lock);

	if (total)
		*total = mdsc->caps_total_count;
	if (avail)
		*avail = mdsc->caps_avail_count;
	if (used)
		*used = mdsc->caps_use_count;
	if (reserved)
		*reserved = mdsc->caps_reserve_count;
	if (min)
		*min = mdsc->caps_min_count;

	spin_unlock(&mdsc->caps_list_lock);
}
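
/*
 * The invariant asserted throughout the accounting above is
 *
 *	caps_total_count == caps_use_count + caps_reserve_count +
 *			    caps_avail_count
 *
 * i.e. every preallocated cap is either attached to an inode, held by
 * a ceph_cap_reservation, or parked on mdsc->caps_list.
 */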
/*
 * Find ceph_cap for given mds, if any.
 *
 * Called with i_ceph_lock held.
 */
static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
{
	struct ceph_cap *cap;
	struct rb_node *n = ci->i_caps.rb_node;

	while (n) {
		cap = rb_entry(n, struct ceph_cap, ci_node);
		if (mds < cap->mds)
			n = n->rb_left;
		else if (mds > cap->mds)
			n = n->rb_right;
		else
			return cap;
	}
	return NULL;
}

struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds)
{
	struct ceph_cap *cap;

	spin_lock(&ci->i_ceph_lock);
	cap = __get_cap_for_mds(ci, mds);
	spin_unlock(&ci->i_ceph_lock);
	return cap;
}

/*
 * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1.
 */
static int __ceph_get_cap_mds(struct ceph_inode_info *ci)
{
	struct ceph_cap *cap;
	int mds = -1;
	struct rb_node *p;

	/* prefer mds with WR|BUFFER|EXCL caps */
	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
		cap = rb_entry(p, struct ceph_cap, ci_node);
		mds = cap->mds;
		if (cap->issued & (CEPH_CAP_FILE_WR |
				   CEPH_CAP_FILE_BUFFER |
				   CEPH_CAP_FILE_EXCL))
			break;
	}
	return mds;
}

int ceph_get_cap_mds(struct inode *inode)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	int mds;

	spin_lock(&ci->i_ceph_lock);
	mds = __ceph_get_cap_mds(ceph_inode(inode));
	spin_unlock(&ci->i_ceph_lock);
	return mds;
}

/*
 * Called under i_ceph_lock.
 */
static void __insert_cap_node(struct ceph_inode_info *ci,
			      struct ceph_cap *new)
{
	struct rb_node **p = &ci->i_caps.rb_node;
	struct rb_node *parent = NULL;
	struct ceph_cap *cap = NULL;

	while (*p) {
		parent = *p;
		cap = rb_entry(parent, struct ceph_cap, ci_node);
		if (new->mds < cap->mds)
			p = &(*p)->rb_left;
		else if (new->mds > cap->mds)
			p = &(*p)->rb_right;
		else
			BUG();
	}

	rb_link_node(&new->ci_node, parent, p);
	rb_insert_color(&new->ci_node, &ci->i_caps);
}
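
/*
 * An inode holds at most one cap per MDS rank, so ci->i_caps is an
 * rbtree keyed by rank: __get_cap_for_mds() above is the O(log n)
 * lookup half of this insert, and inserting a duplicate rank is a BUG.
 */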
/*
 * (re)set cap hold timeouts, which control the delayed release
 * of unused caps back to the MDS.  Should be called on cap use.
 */
static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
			       struct ceph_inode_info *ci)
{
	struct ceph_mount_options *ma = mdsc->fsc->mount_options;

	ci->i_hold_caps_min = round_jiffies(jiffies +
					    ma->caps_wanted_delay_min * HZ);
	ci->i_hold_caps_max = round_jiffies(jiffies +
					    ma->caps_wanted_delay_max * HZ);
	dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
	     ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
}

/*
 * (Re)queue cap at the end of the delayed cap release list.
 *
 * If I_FLUSH is set, leave the inode at the front of the list.
 *
 * Caller holds i_ceph_lock
 *    -> we take mdsc->cap_delay_lock
 */
static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
				struct ceph_inode_info *ci)
{
	__cap_set_timeouts(mdsc, ci);
	dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
	     ci->i_ceph_flags, ci->i_hold_caps_max);
	if (!mdsc->stopping) {
		spin_lock(&mdsc->cap_delay_lock);
		if (!list_empty(&ci->i_cap_delay_list)) {
			if (ci->i_ceph_flags & CEPH_I_FLUSH)
				goto no_change;
			list_del_init(&ci->i_cap_delay_list);
		}
		list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
no_change:
		spin_unlock(&mdsc->cap_delay_lock);
	}
}

/*
 * Queue an inode for immediate writeback.  Mark inode with I_FLUSH,
 * indicating we should send a cap message to flush dirty metadata
 * asap, and move to the front of the delayed cap list.
 */
static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
				      struct ceph_inode_info *ci)
{
	dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
	spin_lock(&mdsc->cap_delay_lock);
	ci->i_ceph_flags |= CEPH_I_FLUSH;
	if (!list_empty(&ci->i_cap_delay_list))
		list_del_init(&ci->i_cap_delay_list);
	list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
	spin_unlock(&mdsc->cap_delay_lock);
}

/*
 * Cancel delayed work on cap.
 *
 * Caller must hold i_ceph_lock.
 */
static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
			       struct ceph_inode_info *ci)
{
	dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
	if (list_empty(&ci->i_cap_delay_list))
		return;
	spin_lock(&mdsc->cap_delay_lock);
	list_del_init(&ci->i_cap_delay_list);
	spin_unlock(&mdsc->cap_delay_lock);
}
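
/*
 * Summary of the delay-list helpers above (informal): cap use requeues
 * the inode at the tail of mdsc->cap_delay_list with fresh min/max
 * timeouts, CEPH_I_FLUSH moves it to the head for prompt attention,
 * and dropping the last cap cancels the delayed work.  The periodic
 * delayed work (see ceph_check_delayed_caps()) drains the list from
 * the head.
 */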
/*
 * Common issue checks for add_cap, handle_cap_grant.
 */
static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
			      unsigned issued)
{
	unsigned had = __ceph_caps_issued(ci, NULL);

	/*
	 * Each time we receive FILE_CACHE anew, we increment
	 * i_rdcache_gen.
	 */
	if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
	    (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) {
		ci->i_rdcache_gen++;
	}

	/*
	 * If FILE_SHARED is newly issued, mark dir not complete.  We don't
	 * know what happened to this directory while we didn't have the cap.
	 * If FILE_SHARED is being revoked, also mark dir not complete.  It
	 * stops on-going cached readdir.
	 */
	if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) {
		if (issued & CEPH_CAP_FILE_SHARED)
			atomic_inc(&ci->i_shared_gen);
		if (S_ISDIR(ci->vfs_inode.i_mode)) {
			dout(" marking %p NOT complete\n", &ci->vfs_inode);
			__ceph_dir_clear_complete(ci);
		}
	}
}

/*
 * Add a capability under the given MDS session.
 *
 * Caller should hold session snap_rwsem (read) and s_mutex.
 *
 * @fmode is the open file mode, if we are opening a file, otherwise
 * it is < 0.  (This is so we can atomically add the cap and add an
 * open file reference to it.)
 */
void ceph_add_cap(struct inode *inode,
		  struct ceph_mds_session *session, u64 cap_id,
		  int fmode, unsigned issued, unsigned wanted,
		  unsigned seq, unsigned mseq, u64 realmino, int flags,
		  struct ceph_cap **new_cap)
{
	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_cap *cap;
	int mds = session->s_mds;
	int actual_wanted;

	dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
	     session->s_mds, cap_id, ceph_cap_string(issued), seq);

	/*
	 * If we are opening the file, include file mode wanted bits
	 * in wanted.
	 */
	if (fmode >= 0)
		wanted |= ceph_caps_for_mode(fmode);

	cap = __get_cap_for_mds(ci, mds);
	if (!cap) {
		cap = *new_cap;
		*new_cap = NULL;

		cap->issued = 0;
		cap->implemented = 0;
		cap->mds = mds;
		cap->mds_wanted = 0;
		cap->mseq = 0;

		cap->ci = ci;
		__insert_cap_node(ci, cap);

		/* add to session cap list */
		cap->session = session;
		spin_lock(&session->s_cap_lock);
		list_add_tail(&cap->session_caps, &session->s_caps);
		session->s_nr_caps++;
		spin_unlock(&session->s_cap_lock);
	} else {
		/*
		 * The auth MDS of the inode changed.  We received the cap
		 * export message, but still haven't received the cap import
		 * message.  handle_cap_export() updated the new auth MDS'
		 * cap.
		 *
		 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
		 * a message that was sent before the cap import message.  So
		 * don't remove caps.
		 */
		if (ceph_seq_cmp(seq, cap->seq) <= 0) {
			WARN_ON(cap != ci->i_auth_cap);
			WARN_ON(cap->cap_id != cap_id);
			seq = cap->seq;
			mseq = cap->mseq;
			issued |= cap->issued;
			flags |= CEPH_CAP_FLAG_AUTH;
		}
	}

	if (!ci->i_snap_realm ||
	    ((flags & CEPH_CAP_FLAG_AUTH) &&
	     realmino != (u64)-1 && ci->i_snap_realm->ino != realmino)) {
		/*
		 * add this inode to the appropriate snap realm
		 */
		struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
							       realmino);
		if (realm) {
			struct ceph_snap_realm *oldrealm = ci->i_snap_realm;
			if (oldrealm) {
				spin_lock(&oldrealm->inodes_with_caps_lock);
				list_del_init(&ci->i_snap_realm_item);
				spin_unlock(&oldrealm->inodes_with_caps_lock);
			}

			spin_lock(&realm->inodes_with_caps_lock);
			list_add(&ci->i_snap_realm_item,
				 &realm->inodes_with_caps);
			ci->i_snap_realm = realm;
			if (realm->ino == ci->i_vino.ino)
				realm->inode = inode;
			spin_unlock(&realm->inodes_with_caps_lock);

			if (oldrealm)
				ceph_put_snap_realm(mdsc, oldrealm);
		} else {
			pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
			       realmino);
			WARN_ON(!realm);
		}
	}

	__check_cap_issue(ci, cap, issued);

	/*
	 * If we are issued caps we don't want, or the mds' wanted
	 * value appears to be off, queue a check so we'll release
	 * later and/or update the mds wanted value.
	 */
	actual_wanted = __ceph_caps_wanted(ci);
	if ((wanted & ~actual_wanted) ||
	    (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
		dout(" issued %s, mds wanted %s, actual %s, queueing\n",
		     ceph_cap_string(issued), ceph_cap_string(wanted),
		     ceph_cap_string(actual_wanted));
		__cap_delay_requeue(mdsc, ci);
	}

	if (flags & CEPH_CAP_FLAG_AUTH) {
		if (!ci->i_auth_cap ||
		    ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) {
			ci->i_auth_cap = cap;
			cap->mds_wanted = wanted;
		}
	} else {
		WARN_ON(ci->i_auth_cap == cap);
	}

	dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
	     inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
	     ceph_cap_string(issued|cap->issued), seq, mds);
	cap->cap_id = cap_id;
	cap->issued = issued;
	cap->implemented |= issued;
	if (ceph_seq_cmp(mseq, cap->mseq) > 0)
		cap->mds_wanted = wanted;
	else
		cap->mds_wanted |= wanted;
	cap->seq = seq;
	cap->issue_seq = seq;
	cap->mseq = mseq;
	cap->cap_gen = session->s_cap_gen;

	if (fmode >= 0)
		__ceph_get_fmode(ci, fmode);
}

/*
 * Return true if cap has not timed out and belongs to the current
 * generation of the MDS session (i.e. has not gone 'stale' due to
 * us losing touch with the mds).
 */
static int __cap_is_valid(struct ceph_cap *cap)
{
	unsigned long ttl;
	u32 gen;

	spin_lock(&cap->session->s_gen_ttl_lock);
	gen = cap->session->s_cap_gen;
	ttl = cap->session->s_cap_ttl;
	spin_unlock(&cap->session->s_gen_ttl_lock);

	if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
		dout("__cap_is_valid %p cap %p issued %s "
		     "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
		     cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
		return 0;
	}

	return 1;
}
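
/*
 * Example of going stale: if we lose touch with the MDS long enough
 * for the session to lapse, s_cap_gen is bumped on reconnect, so a
 * cap whose cap_gen lags the session generation (or whose s_cap_ttl
 * has passed) counts as issuing nothing until it is reissued.
 */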
/*
 * Return set of valid cap bits issued to us.  Note that caps time
 * out, and may be invalidated in bulk if the client session times out
 * and session->s_cap_gen is bumped.
 */
int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
{
	int have = ci->i_snap_caps;
	struct ceph_cap *cap;
	struct rb_node *p;

	if (implemented)
		*implemented = 0;
	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
		cap = rb_entry(p, struct ceph_cap, ci_node);
		if (!__cap_is_valid(cap))
			continue;
		dout("__ceph_caps_issued %p cap %p issued %s\n",
		     &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
		have |= cap->issued;
		if (implemented)
			*implemented |= cap->implemented;
	}
	/*
	 * Exclude caps issued by a non-auth MDS that are being revoked
	 * by the auth MDS.  The non-auth MDS should be revoking/exporting
	 * these caps, but the message is delayed.
	 */
	if (ci->i_auth_cap) {
		cap = ci->i_auth_cap;
		have &= ~cap->implemented | cap->issued;
	}
	return have;
}
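
/*
 * Note the issued vs. implemented distinction used above: 'implemented'
 * is everything the MDS has granted and we have not yet released back;
 * 'issued' is what we may currently use.  (implemented & ~issued) is
 * therefore exactly the set of caps that are mid-revoke; see
 * __ceph_caps_revoking_other() below.
 */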
/*
 * Get cap bits issued by caps other than @ocap
 */
int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
{
	int have = ci->i_snap_caps;
	struct ceph_cap *cap;
	struct rb_node *p;

	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
		cap = rb_entry(p, struct ceph_cap, ci_node);
		if (cap == ocap)
			continue;
		if (!__cap_is_valid(cap))
			continue;
		have |= cap->issued;
	}
	return have;
}

/*
 * Move a cap to the end of the LRU (oldest caps at list head, newest
 * at list tail).
 */
static void __touch_cap(struct ceph_cap *cap)
{
	struct ceph_mds_session *s = cap->session;

	spin_lock(&s->s_cap_lock);
	if (!s->s_cap_iterator) {
		dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
		     s->s_mds);
		list_move_tail(&cap->session_caps, &s->s_caps);
	} else {
		dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
		     &cap->ci->vfs_inode, cap, s->s_mds);
	}
	spin_unlock(&s->s_cap_lock);
}

/*
 * Check if we hold the given mask.  If so, touch the cap(s), moving
 * them to the tail of their respective LRUs.  (This is the preferred
 * way for callers to check for caps they want.)
 */
int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
{
	struct ceph_cap *cap;
	struct rb_node *p;
	int have = ci->i_snap_caps;

	if ((have & mask) == mask) {
		dout("__ceph_caps_issued_mask %p snap issued %s"
		     " (mask %s)\n", &ci->vfs_inode,
		     ceph_cap_string(have),
		     ceph_cap_string(mask));
		return 1;
	}

	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
		cap = rb_entry(p, struct ceph_cap, ci_node);
		if (!__cap_is_valid(cap))
			continue;
		if ((cap->issued & mask) == mask) {
			dout("__ceph_caps_issued_mask %p cap %p issued %s"
			     " (mask %s)\n", &ci->vfs_inode, cap,
			     ceph_cap_string(cap->issued),
			     ceph_cap_string(mask));
			if (touch)
				__touch_cap(cap);
			return 1;
		}

		/* does a combination of caps satisfy mask? */
		have |= cap->issued;
		if ((have & mask) == mask) {
			dout("__ceph_caps_issued_mask %p combo issued %s"
			     " (mask %s)\n", &ci->vfs_inode,
			     ceph_cap_string(cap->issued),
			     ceph_cap_string(mask));
			if (touch) {
				struct rb_node *q;

				/* touch this + preceding caps */
				__touch_cap(cap);
				for (q = rb_first(&ci->i_caps); q != p;
				     q = rb_next(q)) {
					cap = rb_entry(q, struct ceph_cap,
						       ci_node);
					if (!__cap_is_valid(cap))
						continue;
					__touch_cap(cap);
				}
			}
			return 1;
		}
	}

	return 0;
}
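
/*
 * Example: with Fs issued by mds0 and Fc by mds1, a query for mask
 * "Fsc" fails against either cap alone but succeeds via the combined
 * 'have' set; with touch set, the matching cap and every valid cap
 * preceding it in the rbtree are marked recently used.
 */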
/*
 * Return true if mask caps are currently being revoked by an MDS.
 */
int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
			       struct ceph_cap *ocap, int mask)
{
	struct ceph_cap *cap;
	struct rb_node *p;

	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
		cap = rb_entry(p, struct ceph_cap, ci_node);
		if (cap != ocap &&
		    (cap->implemented & ~cap->issued & mask))
			return 1;
	}
	return 0;
}

int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
{
	struct inode *inode = &ci->vfs_inode;
	int ret;

	spin_lock(&ci->i_ceph_lock);
	ret = __ceph_caps_revoking_other(ci, NULL, mask);
	spin_unlock(&ci->i_ceph_lock);
	dout("ceph_caps_revoking %p %s = %d\n", inode,
	     ceph_cap_string(mask), ret);
	return ret;
}

int __ceph_caps_used(struct ceph_inode_info *ci)
{
	int used = 0;
	if (ci->i_pin_ref)
		used |= CEPH_CAP_PIN;
	if (ci->i_rd_ref)
		used |= CEPH_CAP_FILE_RD;
	if (ci->i_rdcache_ref ||
	    (!S_ISDIR(ci->vfs_inode.i_mode) && /* ignore readdir cache */
	     ci->vfs_inode.i_data.nrpages))
		used |= CEPH_CAP_FILE_CACHE;
	if (ci->i_wr_ref)
		used |= CEPH_CAP_FILE_WR;
	if (ci->i_wb_ref || ci->i_wrbuffer_ref)
		used |= CEPH_CAP_FILE_BUFFER;
	return used;
}

/*
 * wanted, by virtue of open file modes
 */
int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
{
	int i, bits = 0;
	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
		if (ci->i_nr_by_mode[i])
			bits |= 1 << i;
	}
	if (bits == 0)
		return 0;
	return ceph_caps_for_mode(bits >> 1);
}
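
/*
 * The bits >> 1 trick above mirrors how open counts are recorded (in
 * this vintage, __ceph_get_fmode() bumps i_nr_by_mode using
 * (fmode << 1) | 1, bit 0 being an implicit PIN): shifting right drops
 * the PIN bit and recovers the OR of the open fmodes.  For instance,
 * read-only opens yield bits 0b011 -> RD, while read+write opens yield
 * 0b111 -> RDWR, which ceph_caps_for_mode() maps to cap bits.
 */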
/*
 * Return caps we have registered with the MDS(s) as 'wanted'.
 */
int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
{
	struct ceph_cap *cap;
	struct rb_node *p;
	int mds_wanted = 0;

	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
		cap = rb_entry(p, struct ceph_cap, ci_node);
		if (check && !__cap_is_valid(cap))
			continue;
		if (cap == ci->i_auth_cap)
			mds_wanted |= cap->mds_wanted;
		else
			mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR);
	}
	return mds_wanted;
}

/*
 * called under i_ceph_lock
 */
static int __ceph_is_single_caps(struct ceph_inode_info *ci)
{
	return rb_first(&ci->i_caps) == rb_last(&ci->i_caps);
}

static int __ceph_is_any_caps(struct ceph_inode_info *ci)
{
	return !RB_EMPTY_ROOT(&ci->i_caps);
}

int ceph_is_any_caps(struct inode *inode)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	int ret;

	spin_lock(&ci->i_ceph_lock);
	ret = __ceph_is_any_caps(ci);
	spin_unlock(&ci->i_ceph_lock);

	return ret;
}

static void drop_inode_snap_realm(struct ceph_inode_info *ci)
{
	struct ceph_snap_realm *realm = ci->i_snap_realm;

	spin_lock(&realm->inodes_with_caps_lock);
	list_del_init(&ci->i_snap_realm_item);
	ci->i_snap_realm_counter++;
	ci->i_snap_realm = NULL;
	spin_unlock(&realm->inodes_with_caps_lock);
	ceph_put_snap_realm(ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc,
			    realm);
}

/*
 * Remove a cap.  Take steps to deal with a racing iterate_session_caps.
 *
 * caller should hold i_ceph_lock.
 * caller will not hold session s_mutex if called from destroy_inode.
 */
void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
{
	struct ceph_mds_session *session = cap->session;
	struct ceph_inode_info *ci = cap->ci;
	struct ceph_mds_client *mdsc =
		ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
	int removed = 0;

	dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);

	/* remove from session list */
	spin_lock(&session->s_cap_lock);
	if (session->s_cap_iterator == cap) {
		/* not yet, we are iterating over this very cap */
		dout("__ceph_remove_cap delaying %p removal from session %p\n",
		     cap, cap->session);
	} else {
		list_del_init(&cap->session_caps);
		session->s_nr_caps--;
		cap->session = NULL;
		removed = 1;
	}
	/* protect backpointer with s_cap_lock: see iterate_session_caps */
	cap->ci = NULL;

	/*
	 * s_cap_reconnect is protected by s_cap_lock. no one changes
	 * s_cap_gen while session is in the reconnect state.
	 */
	if (queue_release &&
	    (!session->s_cap_reconnect || cap->cap_gen == session->s_cap_gen)) {
		cap->queue_release = 1;
		if (removed) {
			list_add_tail(&cap->session_caps,
				      &session->s_cap_releases);
			session->s_num_cap_releases++;
			removed = 0;
		}
	} else {
		cap->queue_release = 0;
	}
	cap->cap_ino = ci->i_vino.ino;

	spin_unlock(&session->s_cap_lock);

	/* remove from inode list */
	rb_erase(&cap->ci_node, &ci->i_caps);
	if (ci->i_auth_cap == cap)
		ci->i_auth_cap = NULL;

	if (removed)
		ceph_put_cap(mdsc, cap);

	/* When a reconnect is denied, we remove session caps forcibly,
	 * so i_wr_ref can be non-zero.  If there are ongoing writes,
	 * keep i_snap_realm.
	 */
	if (!__ceph_is_any_caps(ci) && ci->i_wr_ref == 0 && ci->i_snap_realm)
		drop_inode_snap_realm(ci);

	if (!__ceph_is_any_real_caps(ci))
		__cap_delay_cancel(mdsc, ci);
}
struct cap_msg_args {
	struct ceph_mds_session *session;
	u64 ino, cid, follows;
	u64 flush_tid, oldest_flush_tid, size, max_size;
	u64 xattr_version;
	struct ceph_buffer *xattr_buf;
	struct timespec atime, mtime, ctime;
	int op, caps, wanted, dirty;
	u32 seq, issue_seq, mseq, time_warp_seq;
	u32 flags;
	kuid_t uid;
	kgid_t gid;
	umode_t mode;
	bool inline_data;
};

/*
 * Build and send a cap message to the given MDS.
 *
 * Caller should be holding s_mutex.
 */
static int send_cap_msg(struct cap_msg_args *arg)
{
	struct ceph_mds_caps *fc;
	struct ceph_msg *msg;
	void *p;
	size_t extra_len;
	struct timespec zerotime = {0};
	struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc;

	dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
	     " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu"
	     " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(arg->op),
	     arg->cid, arg->ino, ceph_cap_string(arg->caps),
	     ceph_cap_string(arg->wanted), ceph_cap_string(arg->dirty),
	     arg->seq, arg->issue_seq, arg->flush_tid, arg->oldest_flush_tid,
	     arg->mseq, arg->follows, arg->size, arg->max_size,
	     arg->xattr_version,
	     arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0);

	/* flock buffer size + inline version + inline data size +
	 * osd_epoch_barrier + oldest_flush_tid + caller uid/gid +
	 * pool namespace + btime/change_attr + flags */
	extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4;
	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len,
			   GFP_NOFS, false);
	if (!msg)
		return -ENOMEM;

	msg->hdr.version = cpu_to_le16(10);
	msg->hdr.tid = cpu_to_le64(arg->flush_tid);

	fc = msg->front.iov_base;
	memset(fc, 0, sizeof(*fc));

	fc->cap_id = cpu_to_le64(arg->cid);
	fc->op = cpu_to_le32(arg->op);
	fc->seq = cpu_to_le32(arg->seq);
	fc->issue_seq = cpu_to_le32(arg->issue_seq);
	fc->migrate_seq = cpu_to_le32(arg->mseq);
	fc->caps = cpu_to_le32(arg->caps);
	fc->wanted = cpu_to_le32(arg->wanted);
	fc->dirty = cpu_to_le32(arg->dirty);
	fc->ino = cpu_to_le64(arg->ino);
	fc->snap_follows = cpu_to_le64(arg->follows);

	fc->size = cpu_to_le64(arg->size);
	fc->max_size = cpu_to_le64(arg->max_size);
	ceph_encode_timespec(&fc->mtime, &arg->mtime);
	ceph_encode_timespec(&fc->atime, &arg->atime);
	ceph_encode_timespec(&fc->ctime, &arg->ctime);
	fc->time_warp_seq = cpu_to_le32(arg->time_warp_seq);

	fc->uid = cpu_to_le32(from_kuid(&init_user_ns, arg->uid));
	fc->gid = cpu_to_le32(from_kgid(&init_user_ns, arg->gid));
	fc->mode = cpu_to_le32(arg->mode);

	fc->xattr_version = cpu_to_le64(arg->xattr_version);
	if (arg->xattr_buf) {
		msg->middle = ceph_buffer_get(arg->xattr_buf);
		fc->xattr_len = cpu_to_le32(arg->xattr_buf->vec.iov_len);
		msg->hdr.middle_len = cpu_to_le32(arg->xattr_buf->vec.iov_len);
	}

	p = fc + 1;
	/* flock buffer size (version 2) */
	ceph_encode_32(&p, 0);
	/* inline version (version 4) */
	ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE);
	/* inline data size */
	ceph_encode_32(&p, 0);
	/*
	 * osd_epoch_barrier (version 5)
	 * The epoch_barrier is protected by osdc->lock, so READ_ONCE here in
	 * case it was recently changed
	 */
	ceph_encode_32(&p, READ_ONCE(osdc->epoch_barrier));
	/* oldest_flush_tid (version 6) */
	ceph_encode_64(&p, arg->oldest_flush_tid);

	/*
	 * caller_uid/caller_gid (version 7)
	 *
	 * Currently, we don't properly track which caller dirtied the caps
	 * last, and force a flush of them when there is a conflict.  For now,
	 * just set this to 0:0, to emulate how the MDS has worked up to now.
	 */
	ceph_encode_32(&p, 0);
	ceph_encode_32(&p, 0);

	/* pool namespace (version 8) (mds always ignores this) */
	ceph_encode_32(&p, 0);

	/*
	 * btime and change_attr (version 9)
	 *
	 * We just zero these out for now, as the MDS ignores them unless
	 * the requisite feature flags are set (which we don't do yet).
	 */
	ceph_encode_timespec(p, &zerotime);
	p += sizeof(struct ceph_timespec);
	ceph_encode_64(&p, 0);

	/* Advisory flags (version 10) */
	ceph_encode_32(&p, arg->flags);

	ceph_con_send(&arg->session->s_con, msg);
	return 0;
}
  1106. /*
  1107. * Queue cap releases when an inode is dropped from our cache. Since
  1108. * inode is about to be destroyed, there is no need for i_ceph_lock.
  1109. */
  1110. void ceph_queue_caps_release(struct inode *inode)
  1111. {
  1112. struct ceph_inode_info *ci = ceph_inode(inode);
  1113. struct rb_node *p;
  1114. p = rb_first(&ci->i_caps);
  1115. while (p) {
  1116. struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
  1117. p = rb_next(p);
  1118. __ceph_remove_cap(cap, true);
  1119. }
  1120. }
  1121. /*
  1122. * Send a cap msg on the given inode. Update our caps state, then
  1123. * drop i_ceph_lock and send the message.
  1124. *
  1125. * Make note of max_size reported/requested from mds, revoked caps
  1126. * that have now been implemented.
  1127. *
  1128. * Make half-hearted attempt ot to invalidate page cache if we are
  1129. * dropping RDCACHE. Note that this will leave behind locked pages
  1130. * that we'll then need to deal with elsewhere.
  1131. *
  1132. * Return non-zero if delayed release, or we experienced an error
  1133. * such that the caller should requeue + retry later.
  1134. *
  1135. * called with i_ceph_lock, then drops it.
  1136. * caller should hold snap_rwsem (read), s_mutex.
  1137. */
  1138. static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
  1139. int op, bool sync, int used, int want, int retain,
  1140. int flushing, u64 flush_tid, u64 oldest_flush_tid)
  1141. __releases(cap->ci->i_ceph_lock)
  1142. {
  1143. struct ceph_inode_info *ci = cap->ci;
  1144. struct inode *inode = &ci->vfs_inode;
  1145. struct cap_msg_args arg;
  1146. int held, revoking;
  1147. int wake = 0;
  1148. int delayed = 0;
  1149. int ret;
  1150. held = cap->issued | cap->implemented;
  1151. revoking = cap->implemented & ~cap->issued;
  1152. retain &= ~revoking;
  1153. dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
  1154. inode, cap, cap->session,
  1155. ceph_cap_string(held), ceph_cap_string(held & retain),
  1156. ceph_cap_string(revoking));
  1157. BUG_ON((retain & CEPH_CAP_PIN) == 0);
  1158. arg.session = cap->session;
  1159. /* don't release wanted unless we've waited a bit. */
  1160. if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
  1161. time_before(jiffies, ci->i_hold_caps_min)) {
  1162. dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
  1163. ceph_cap_string(cap->issued),
  1164. ceph_cap_string(cap->issued & retain),
  1165. ceph_cap_string(cap->mds_wanted),
  1166. ceph_cap_string(want));
  1167. want |= cap->mds_wanted;
  1168. retain |= cap->issued;
  1169. delayed = 1;
  1170. }
  1171. ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
  1172. if (want & ~cap->mds_wanted) {
/* user space may open/close a single file frequently.
* This avoids dropping mds_wanted immediately after
* requesting new mds_wanted.
*/
  1177. __cap_set_timeouts(mdsc, ci);
  1178. }
  1179. cap->issued &= retain; /* drop bits we don't want */
  1180. if (cap->implemented & ~cap->issued) {
  1181. /*
  1182. * Wake up any waiters on wanted -> needed transition.
  1183. * This is due to the weird transition from buffered
  1184. * to sync IO... we need to flush dirty pages _before_
  1185. * allowing sync writes to avoid reordering.
  1186. */
  1187. wake = 1;
  1188. }
  1189. cap->implemented &= cap->issued | used;
  1190. cap->mds_wanted = want;
  1191. arg.ino = ceph_vino(inode).ino;
  1192. arg.cid = cap->cap_id;
  1193. arg.follows = flushing ? ci->i_head_snapc->seq : 0;
  1194. arg.flush_tid = flush_tid;
  1195. arg.oldest_flush_tid = oldest_flush_tid;
  1196. arg.size = inode->i_size;
  1197. ci->i_reported_size = arg.size;
  1198. arg.max_size = ci->i_wanted_max_size;
  1199. ci->i_requested_max_size = arg.max_size;
  1200. if (flushing & CEPH_CAP_XATTR_EXCL) {
  1201. __ceph_build_xattrs_blob(ci);
  1202. arg.xattr_version = ci->i_xattrs.version;
  1203. arg.xattr_buf = ci->i_xattrs.blob;
  1204. } else {
  1205. arg.xattr_buf = NULL;
  1206. }
  1207. arg.mtime = timespec64_to_timespec(inode->i_mtime);
  1208. arg.atime = timespec64_to_timespec(inode->i_atime);
  1209. arg.ctime = timespec64_to_timespec(inode->i_ctime);
  1210. arg.op = op;
  1211. arg.caps = cap->implemented;
  1212. arg.wanted = want;
  1213. arg.dirty = flushing;
  1214. arg.seq = cap->seq;
  1215. arg.issue_seq = cap->issue_seq;
  1216. arg.mseq = cap->mseq;
  1217. arg.time_warp_seq = ci->i_time_warp_seq;
  1218. arg.uid = inode->i_uid;
  1219. arg.gid = inode->i_gid;
  1220. arg.mode = inode->i_mode;
  1221. arg.inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
  1222. if (list_empty(&ci->i_cap_snaps))
  1223. arg.flags = CEPH_CLIENT_CAPS_NO_CAPSNAP;
  1224. else
  1225. arg.flags = CEPH_CLIENT_CAPS_PENDING_CAPSNAP;
  1226. if (sync)
  1227. arg.flags |= CEPH_CLIENT_CAPS_SYNC;
  1228. spin_unlock(&ci->i_ceph_lock);
  1229. ret = send_cap_msg(&arg);
  1230. if (ret < 0) {
  1231. dout("error sending cap msg, must requeue %p\n", inode);
  1232. delayed = 1;
  1233. }
  1234. if (wake)
  1235. wake_up_all(&ci->i_cap_wq);
  1236. return delayed;
  1237. }
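/*
* Caller sketch (illustrative only, patterned on the call site in
* ceph_check_caps() below): enter with i_ceph_lock held, and do not
* assume it is still held on return:
*
*   spin_lock(&ci->i_ceph_lock);
*   ...choose op/used/want/retain under the lock...
*   delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, false,
*                         cap_used, want, retain, flushing,
*                         flush_tid, oldest_flush_tid);
*   ...i_ceph_lock has been dropped here; retake it before
*   touching ci state again...
*/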
  1238. static inline int __send_flush_snap(struct inode *inode,
  1239. struct ceph_mds_session *session,
  1240. struct ceph_cap_snap *capsnap,
  1241. u32 mseq, u64 oldest_flush_tid)
  1242. {
  1243. struct cap_msg_args arg;
  1244. arg.session = session;
  1245. arg.ino = ceph_vino(inode).ino;
  1246. arg.cid = 0;
  1247. arg.follows = capsnap->follows;
  1248. arg.flush_tid = capsnap->cap_flush.tid;
  1249. arg.oldest_flush_tid = oldest_flush_tid;
  1250. arg.size = capsnap->size;
  1251. arg.max_size = 0;
  1252. arg.xattr_version = capsnap->xattr_version;
  1253. arg.xattr_buf = capsnap->xattr_blob;
  1254. arg.atime = capsnap->atime;
  1255. arg.mtime = capsnap->mtime;
  1256. arg.ctime = capsnap->ctime;
  1257. arg.op = CEPH_CAP_OP_FLUSHSNAP;
  1258. arg.caps = capsnap->issued;
  1259. arg.wanted = 0;
  1260. arg.dirty = capsnap->dirty;
  1261. arg.seq = 0;
  1262. arg.issue_seq = 0;
  1263. arg.mseq = mseq;
  1264. arg.time_warp_seq = capsnap->time_warp_seq;
  1265. arg.uid = capsnap->uid;
  1266. arg.gid = capsnap->gid;
  1267. arg.mode = capsnap->mode;
  1268. arg.inline_data = capsnap->inline_data;
  1269. arg.flags = 0;
  1270. return send_cap_msg(&arg);
  1271. }
  1272. /*
  1273. * When a snapshot is taken, clients accumulate dirty metadata on
  1274. * inodes with capabilities in ceph_cap_snaps to describe the file
  1275. * state at the time the snapshot was taken. This must be flushed
  1276. * asynchronously back to the MDS once sync writes complete and dirty
  1277. * data is written out.
  1278. *
  1279. * Called under i_ceph_lock. Takes s_mutex as needed.
  1280. */
  1281. static void __ceph_flush_snaps(struct ceph_inode_info *ci,
  1282. struct ceph_mds_session *session)
  1283. __releases(ci->i_ceph_lock)
  1284. __acquires(ci->i_ceph_lock)
  1285. {
  1286. struct inode *inode = &ci->vfs_inode;
  1287. struct ceph_mds_client *mdsc = session->s_mdsc;
  1288. struct ceph_cap_snap *capsnap;
  1289. u64 oldest_flush_tid = 0;
  1290. u64 first_tid = 1, last_tid = 0;
  1291. dout("__flush_snaps %p session %p\n", inode, session);
  1292. list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
  1293. /*
  1294. * we need to wait for sync writes to complete and for dirty
  1295. * pages to be written out.
  1296. */
  1297. if (capsnap->dirty_pages || capsnap->writing)
  1298. break;
  1299. /* should be removed by ceph_try_drop_cap_snap() */
  1300. BUG_ON(!capsnap->need_flush);
  1301. /* only flush each capsnap once */
  1302. if (capsnap->cap_flush.tid > 0) {
  1303. dout(" already flushed %p, skipping\n", capsnap);
  1304. continue;
  1305. }
  1306. spin_lock(&mdsc->cap_dirty_lock);
  1307. capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid;
  1308. list_add_tail(&capsnap->cap_flush.g_list,
  1309. &mdsc->cap_flush_list);
  1310. if (oldest_flush_tid == 0)
  1311. oldest_flush_tid = __get_oldest_flush_tid(mdsc);
  1312. if (list_empty(&ci->i_flushing_item)) {
  1313. list_add_tail(&ci->i_flushing_item,
  1314. &session->s_cap_flushing);
  1315. }
  1316. spin_unlock(&mdsc->cap_dirty_lock);
  1317. list_add_tail(&capsnap->cap_flush.i_list,
  1318. &ci->i_cap_flush_list);
  1319. if (first_tid == 1)
  1320. first_tid = capsnap->cap_flush.tid;
  1321. last_tid = capsnap->cap_flush.tid;
  1322. }
  1323. ci->i_ceph_flags &= ~CEPH_I_FLUSH_SNAPS;
  1324. while (first_tid <= last_tid) {
  1325. struct ceph_cap *cap = ci->i_auth_cap;
  1326. struct ceph_cap_flush *cf;
  1327. int ret;
  1328. if (!(cap && cap->session == session)) {
  1329. dout("__flush_snaps %p auth cap %p not mds%d, "
  1330. "stop\n", inode, cap, session->s_mds);
  1331. break;
  1332. }
  1333. ret = -ENOENT;
  1334. list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
  1335. if (cf->tid >= first_tid) {
  1336. ret = 0;
  1337. break;
  1338. }
  1339. }
  1340. if (ret < 0)
  1341. break;
  1342. first_tid = cf->tid + 1;
  1343. capsnap = container_of(cf, struct ceph_cap_snap, cap_flush);
  1344. refcount_inc(&capsnap->nref);
  1345. spin_unlock(&ci->i_ceph_lock);
  1346. dout("__flush_snaps %p capsnap %p tid %llu %s\n",
  1347. inode, capsnap, cf->tid, ceph_cap_string(capsnap->dirty));
  1348. ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
  1349. oldest_flush_tid);
  1350. if (ret < 0) {
  1351. pr_err("__flush_snaps: error sending cap flushsnap, "
  1352. "ino (%llx.%llx) tid %llu follows %llu\n",
  1353. ceph_vinop(inode), cf->tid, capsnap->follows);
  1354. }
  1355. ceph_put_cap_snap(capsnap);
  1356. spin_lock(&ci->i_ceph_lock);
  1357. }
  1358. }
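/*
* Note the two-phase structure above: tids are first assigned to all
* flushable capsnaps under cap_dirty_lock, then the FLUSHSNAP messages
* are sent in tid order, dropping and retaking i_ceph_lock around each
* send.
*/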
  1359. void ceph_flush_snaps(struct ceph_inode_info *ci,
  1360. struct ceph_mds_session **psession)
  1361. {
  1362. struct inode *inode = &ci->vfs_inode;
  1363. struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
  1364. struct ceph_mds_session *session = NULL;
  1365. int mds;
  1366. dout("ceph_flush_snaps %p\n", inode);
  1367. if (psession)
  1368. session = *psession;
  1369. retry:
  1370. spin_lock(&ci->i_ceph_lock);
  1371. if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) {
  1372. dout(" no capsnap needs flush, doing nothing\n");
  1373. goto out;
  1374. }
  1375. if (!ci->i_auth_cap) {
  1376. dout(" no auth cap (migrating?), doing nothing\n");
  1377. goto out;
  1378. }
  1379. mds = ci->i_auth_cap->session->s_mds;
  1380. if (session && session->s_mds != mds) {
  1381. dout(" oops, wrong session %p mutex\n", session);
  1382. mutex_unlock(&session->s_mutex);
  1383. ceph_put_mds_session(session);
  1384. session = NULL;
  1385. }
  1386. if (!session) {
  1387. spin_unlock(&ci->i_ceph_lock);
  1388. mutex_lock(&mdsc->mutex);
  1389. session = __ceph_lookup_mds_session(mdsc, mds);
  1390. mutex_unlock(&mdsc->mutex);
  1391. if (session) {
  1392. dout(" inverting session/ino locks on %p\n", session);
  1393. mutex_lock(&session->s_mutex);
  1394. }
  1395. goto retry;
  1396. }
  1397. // make sure flushsnap messages are sent in proper order.
  1398. if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
  1399. __kick_flushing_caps(mdsc, session, ci, 0);
  1400. ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
  1401. }
  1402. __ceph_flush_snaps(ci, session);
  1403. out:
  1404. spin_unlock(&ci->i_ceph_lock);
  1405. if (psession) {
  1406. *psession = session;
  1407. } else if (session) {
  1408. mutex_unlock(&session->s_mutex);
  1409. ceph_put_mds_session(session);
  1410. }
  1411. /* we flushed them all; remove this inode from the queue */
  1412. spin_lock(&mdsc->snap_flush_lock);
  1413. list_del_init(&ci->i_snap_flush_item);
  1414. spin_unlock(&mdsc->snap_flush_lock);
  1415. }
  1416. /*
  1417. * Mark caps dirty. If inode is newly dirty, return the dirty flags.
  1418. * Caller is then responsible for calling __mark_inode_dirty with the
  1419. * returned flags value.
  1420. */
  1421. int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
  1422. struct ceph_cap_flush **pcf)
  1423. {
  1424. struct ceph_mds_client *mdsc =
  1425. ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
  1426. struct inode *inode = &ci->vfs_inode;
  1427. int was = ci->i_dirty_caps;
  1428. int dirty = 0;
  1429. if (!ci->i_auth_cap) {
  1430. pr_warn("__mark_dirty_caps %p %llx mask %s, "
  1431. "but no auth cap (session was closed?)\n",
  1432. inode, ceph_ino(inode), ceph_cap_string(mask));
  1433. return 0;
  1434. }
  1435. dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
  1436. ceph_cap_string(mask), ceph_cap_string(was),
  1437. ceph_cap_string(was | mask));
  1438. ci->i_dirty_caps |= mask;
  1439. if (was == 0) {
  1440. WARN_ON_ONCE(ci->i_prealloc_cap_flush);
  1441. swap(ci->i_prealloc_cap_flush, *pcf);
  1442. if (!ci->i_head_snapc) {
  1443. WARN_ON_ONCE(!rwsem_is_locked(&mdsc->snap_rwsem));
  1444. ci->i_head_snapc = ceph_get_snap_context(
  1445. ci->i_snap_realm->cached_context);
  1446. }
  1447. dout(" inode %p now dirty snapc %p auth cap %p\n",
  1448. &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
  1449. BUG_ON(!list_empty(&ci->i_dirty_item));
  1450. spin_lock(&mdsc->cap_dirty_lock);
  1451. list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
  1452. spin_unlock(&mdsc->cap_dirty_lock);
  1453. if (ci->i_flushing_caps == 0) {
  1454. ihold(inode);
  1455. dirty |= I_DIRTY_SYNC;
  1456. }
  1457. } else {
  1458. WARN_ON_ONCE(!ci->i_prealloc_cap_flush);
  1459. }
  1460. BUG_ON(list_empty(&ci->i_dirty_item));
  1461. if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
  1462. (mask & CEPH_CAP_FILE_BUFFER))
  1463. dirty |= I_DIRTY_DATASYNC;
  1464. __cap_delay_requeue(mdsc, ci);
  1465. return dirty;
  1466. }
  1467. struct ceph_cap_flush *ceph_alloc_cap_flush(void)
  1468. {
  1469. return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
  1470. }
  1471. void ceph_free_cap_flush(struct ceph_cap_flush *cf)
  1472. {
  1473. if (cf)
  1474. kmem_cache_free(ceph_cap_flush_cachep, cf);
  1475. }
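/*
* Typical usage of the two helpers above together with
* __ceph_mark_dirty_caps() (a sketch, mirroring callers elsewhere in
* fs/ceph):
*
*   struct ceph_cap_flush *prealloc_cf = ceph_alloc_cap_flush();
*   int dirty;
*
*   if (!prealloc_cf)
*       return -ENOMEM;
*   spin_lock(&ci->i_ceph_lock);
*   dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf);
*   spin_unlock(&ci->i_ceph_lock);
*   if (dirty)
*       __mark_inode_dirty(&ci->vfs_inode, dirty);
*   ceph_free_cap_flush(prealloc_cf);
*/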
  1476. static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc)
  1477. {
  1478. if (!list_empty(&mdsc->cap_flush_list)) {
  1479. struct ceph_cap_flush *cf =
  1480. list_first_entry(&mdsc->cap_flush_list,
  1481. struct ceph_cap_flush, g_list);
  1482. return cf->tid;
  1483. }
  1484. return 0;
  1485. }
  1486. /*
  1487. * Remove cap_flush from the mdsc's or inode's flushing cap list.
  1488. * Return true if caller needs to wake up flush waiters.
  1489. */
  1490. static bool __finish_cap_flush(struct ceph_mds_client *mdsc,
  1491. struct ceph_inode_info *ci,
  1492. struct ceph_cap_flush *cf)
  1493. {
  1494. struct ceph_cap_flush *prev;
  1495. bool wake = cf->wake;
  1496. if (mdsc) {
  1497. /* are there older pending cap flushes? */
  1498. if (wake && cf->g_list.prev != &mdsc->cap_flush_list) {
  1499. prev = list_prev_entry(cf, g_list);
  1500. prev->wake = true;
  1501. wake = false;
  1502. }
  1503. list_del(&cf->g_list);
  1504. } else if (ci) {
  1505. if (wake && cf->i_list.prev != &ci->i_cap_flush_list) {
  1506. prev = list_prev_entry(cf, i_list);
  1507. prev->wake = true;
  1508. wake = false;
  1509. }
  1510. list_del(&cf->i_list);
  1511. } else {
  1512. BUG_ON(1);
  1513. }
  1514. return wake;
  1515. }
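/*
* Note: if the finished flush is not the oldest entry on its list, its
* wake request is forwarded to the previous (older) entry, so waiters
* are only woken once everything up to and including their tid is done.
*/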
/*
* Add dirty inode to the flushing list. Assign a seq number so we
* can wait for caps to flush without starving.
*
* Called under i_ceph_lock.
*/
  1522. static int __mark_caps_flushing(struct inode *inode,
  1523. struct ceph_mds_session *session, bool wake,
  1524. u64 *flush_tid, u64 *oldest_flush_tid)
  1525. {
  1526. struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
  1527. struct ceph_inode_info *ci = ceph_inode(inode);
  1528. struct ceph_cap_flush *cf = NULL;
  1529. int flushing;
  1530. BUG_ON(ci->i_dirty_caps == 0);
  1531. BUG_ON(list_empty(&ci->i_dirty_item));
  1532. BUG_ON(!ci->i_prealloc_cap_flush);
  1533. flushing = ci->i_dirty_caps;
  1534. dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
  1535. ceph_cap_string(flushing),
  1536. ceph_cap_string(ci->i_flushing_caps),
  1537. ceph_cap_string(ci->i_flushing_caps | flushing));
  1538. ci->i_flushing_caps |= flushing;
  1539. ci->i_dirty_caps = 0;
  1540. dout(" inode %p now !dirty\n", inode);
  1541. swap(cf, ci->i_prealloc_cap_flush);
  1542. cf->caps = flushing;
  1543. cf->wake = wake;
  1544. spin_lock(&mdsc->cap_dirty_lock);
  1545. list_del_init(&ci->i_dirty_item);
  1546. cf->tid = ++mdsc->last_cap_flush_tid;
  1547. list_add_tail(&cf->g_list, &mdsc->cap_flush_list);
  1548. *oldest_flush_tid = __get_oldest_flush_tid(mdsc);
  1549. if (list_empty(&ci->i_flushing_item)) {
  1550. list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
  1551. mdsc->num_cap_flushing++;
  1552. }
  1553. spin_unlock(&mdsc->cap_dirty_lock);
  1554. list_add_tail(&cf->i_list, &ci->i_cap_flush_list);
  1555. *flush_tid = cf->tid;
  1556. return flushing;
  1557. }
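/*
* The flush tid is allocated while cap_dirty_lock is held, so the order
* of entries on mdsc->cap_flush_list always matches tid order.
*/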
  1558. /*
  1559. * try to invalidate mapping pages without blocking.
  1560. */
  1561. static int try_nonblocking_invalidate(struct inode *inode)
  1562. {
  1563. struct ceph_inode_info *ci = ceph_inode(inode);
  1564. u32 invalidating_gen = ci->i_rdcache_gen;
  1565. spin_unlock(&ci->i_ceph_lock);
  1566. invalidate_mapping_pages(&inode->i_data, 0, -1);
  1567. spin_lock(&ci->i_ceph_lock);
  1568. if (inode->i_data.nrpages == 0 &&
  1569. invalidating_gen == ci->i_rdcache_gen) {
  1570. /* success. */
  1571. dout("try_nonblocking_invalidate %p success\n", inode);
  1572. /* save any racing async invalidate some trouble */
  1573. ci->i_rdcache_revoking = ci->i_rdcache_gen - 1;
  1574. return 0;
  1575. }
  1576. dout("try_nonblocking_invalidate %p failed\n", inode);
  1577. return -1;
  1578. }
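/*
* Success requires both an empty mapping and an unchanged i_rdcache_gen;
* the gen is bumped when FILE_CACHE is issued again, so the recheck
* catches caching that restarted while i_ceph_lock was dropped.
*/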
  1579. bool __ceph_should_report_size(struct ceph_inode_info *ci)
  1580. {
  1581. loff_t size = ci->vfs_inode.i_size;
  1582. /* mds will adjust max size according to the reported size */
  1583. if (ci->i_flushing_caps & CEPH_CAP_FILE_WR)
  1584. return false;
  1585. if (size >= ci->i_max_size)
  1586. return true;
  1587. /* half of previous max_size increment has been used */
  1588. if (ci->i_max_size > ci->i_reported_size &&
  1589. (size << 1) >= ci->i_max_size + ci->i_reported_size)
  1590. return true;
  1591. return false;
  1592. }
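/*
* Worked example (illustrative numbers): with i_reported_size = 4M and
* i_max_size = 8M, the "half of the increment used" test fires once the
* size reaches 6M, since (6M << 1) == 12M >= 8M + 4M.
*/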
  1593. /*
  1594. * Swiss army knife function to examine currently used and wanted
  1595. * versus held caps. Release, flush, ack revoked caps to mds as
  1596. * appropriate.
  1597. *
  1598. * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
  1599. * cap release further.
  1600. * CHECK_CAPS_AUTHONLY - we should only check the auth cap
  1601. * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
  1602. * further delay.
  1603. */
  1604. void ceph_check_caps(struct ceph_inode_info *ci, int flags,
  1605. struct ceph_mds_session *session)
  1606. {
  1607. struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
  1608. struct ceph_mds_client *mdsc = fsc->mdsc;
  1609. struct inode *inode = &ci->vfs_inode;
  1610. struct ceph_cap *cap;
  1611. u64 flush_tid, oldest_flush_tid;
  1612. int file_wanted, used, cap_used;
  1613. int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
  1614. int issued, implemented, want, retain, revoking, flushing = 0;
  1615. int mds = -1; /* keep track of how far we've gone through i_caps list
  1616. to avoid an infinite loop on retry */
  1617. struct rb_node *p;
  1618. int delayed = 0, sent = 0;
  1619. bool no_delay = flags & CHECK_CAPS_NODELAY;
  1620. bool queue_invalidate = false;
  1621. bool tried_invalidate = false;
  1622. /* if we are unmounting, flush any unused caps immediately. */
  1623. if (mdsc->stopping)
  1624. no_delay = true;
  1625. spin_lock(&ci->i_ceph_lock);
  1626. if (ci->i_ceph_flags & CEPH_I_FLUSH)
  1627. flags |= CHECK_CAPS_FLUSH;
  1628. if (!(flags & CHECK_CAPS_AUTHONLY) ||
  1629. (ci->i_auth_cap && __ceph_is_single_caps(ci)))
  1630. __cap_delay_cancel(mdsc, ci);
  1631. goto retry_locked;
  1632. retry:
  1633. spin_lock(&ci->i_ceph_lock);
  1634. retry_locked:
  1635. file_wanted = __ceph_caps_file_wanted(ci);
  1636. used = __ceph_caps_used(ci);
  1637. issued = __ceph_caps_issued(ci, &implemented);
  1638. revoking = implemented & ~issued;
  1639. want = file_wanted;
  1640. retain = file_wanted | used | CEPH_CAP_PIN;
  1641. if (!mdsc->stopping && inode->i_nlink > 0) {
  1642. if (file_wanted) {
  1643. retain |= CEPH_CAP_ANY; /* be greedy */
  1644. } else if (S_ISDIR(inode->i_mode) &&
  1645. (issued & CEPH_CAP_FILE_SHARED) &&
  1646. __ceph_dir_is_complete(ci)) {
  1647. /*
  1648. * If a directory is complete, we want to keep
  1649. * the exclusive cap. So that MDS does not end up
  1650. * revoking the shared cap on every create/unlink
  1651. * operation.
  1652. */
  1653. want = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
  1654. retain |= want;
  1655. } else {
  1656. retain |= CEPH_CAP_ANY_SHARED;
  1657. /*
  1658. * keep RD only if we didn't have the file open RW,
  1659. * because then the mds would revoke it anyway to
  1660. * journal max_size=0.
  1661. */
  1662. if (ci->i_max_size == 0)
  1663. retain |= CEPH_CAP_ANY_RD;
  1664. }
  1665. }
  1666. dout("check_caps %p file_want %s used %s dirty %s flushing %s"
  1667. " issued %s revoking %s retain %s %s%s%s\n", inode,
  1668. ceph_cap_string(file_wanted),
  1669. ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
  1670. ceph_cap_string(ci->i_flushing_caps),
  1671. ceph_cap_string(issued), ceph_cap_string(revoking),
  1672. ceph_cap_string(retain),
  1673. (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
  1674. (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
  1675. (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
/*
* If we no longer need to hold onto our old caps, and we may
* have cached pages, but don't want them, then try to invalidate.
* If we fail, it's because pages are locked... try again later.
*/
  1681. if ((!no_delay || mdsc->stopping) &&
  1682. !S_ISDIR(inode->i_mode) && /* ignore readdir cache */
  1683. !(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... */
  1684. inode->i_data.nrpages && /* have cached pages */
  1685. (revoking & (CEPH_CAP_FILE_CACHE|
  1686. CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */
  1687. !tried_invalidate) {
  1688. dout("check_caps trying to invalidate on %p\n", inode);
  1689. if (try_nonblocking_invalidate(inode) < 0) {
  1690. dout("check_caps queuing invalidate\n");
  1691. queue_invalidate = true;
  1692. ci->i_rdcache_revoking = ci->i_rdcache_gen;
  1693. }
  1694. tried_invalidate = true;
  1695. goto retry_locked;
  1696. }
  1697. for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
  1698. cap = rb_entry(p, struct ceph_cap, ci_node);
  1699. /* avoid looping forever */
  1700. if (mds >= cap->mds ||
  1701. ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
  1702. continue;
  1703. /* NOTE: no side-effects allowed, until we take s_mutex */
  1704. cap_used = used;
  1705. if (ci->i_auth_cap && cap != ci->i_auth_cap)
  1706. cap_used &= ~ci->i_auth_cap->issued;
  1707. revoking = cap->implemented & ~cap->issued;
  1708. dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
  1709. cap->mds, cap, ceph_cap_string(cap_used),
  1710. ceph_cap_string(cap->issued),
  1711. ceph_cap_string(cap->implemented),
  1712. ceph_cap_string(revoking));
  1713. if (cap == ci->i_auth_cap &&
  1714. (cap->issued & CEPH_CAP_FILE_WR)) {
  1715. /* request larger max_size from MDS? */
  1716. if (ci->i_wanted_max_size > ci->i_max_size &&
  1717. ci->i_wanted_max_size > ci->i_requested_max_size) {
  1718. dout("requesting new max_size\n");
  1719. goto ack;
  1720. }
  1721. /* approaching file_max? */
  1722. if (__ceph_should_report_size(ci)) {
  1723. dout("i_size approaching max_size\n");
  1724. goto ack;
  1725. }
  1726. }
  1727. /* flush anything dirty? */
  1728. if (cap == ci->i_auth_cap) {
  1729. if ((flags & CHECK_CAPS_FLUSH) && ci->i_dirty_caps) {
  1730. dout("flushing dirty caps\n");
  1731. goto ack;
  1732. }
  1733. if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) {
  1734. dout("flushing snap caps\n");
  1735. goto ack;
  1736. }
  1737. }
  1738. /* completed revocation? going down and there are no caps? */
  1739. if (revoking && (revoking & cap_used) == 0) {
  1740. dout("completed revocation of %s\n",
  1741. ceph_cap_string(cap->implemented & ~cap->issued));
  1742. goto ack;
  1743. }
  1744. /* want more caps from mds? */
  1745. if (want & ~(cap->mds_wanted | cap->issued))
  1746. goto ack;
  1747. /* things we might delay */
  1748. if ((cap->issued & ~retain) == 0 &&
  1749. cap->mds_wanted == want)
  1750. continue; /* nope, all good */
  1751. if (no_delay)
  1752. goto ack;
  1753. /* delay? */
  1754. if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
  1755. time_before(jiffies, ci->i_hold_caps_max)) {
  1756. dout(" delaying issued %s -> %s, wanted %s -> %s\n",
  1757. ceph_cap_string(cap->issued),
  1758. ceph_cap_string(cap->issued & retain),
  1759. ceph_cap_string(cap->mds_wanted),
  1760. ceph_cap_string(want));
  1761. delayed++;
  1762. continue;
  1763. }
  1764. ack:
  1765. if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
  1766. dout(" skipping %p I_NOFLUSH set\n", inode);
  1767. continue;
  1768. }
  1769. if (session && session != cap->session) {
  1770. dout("oops, wrong session %p mutex\n", session);
  1771. mutex_unlock(&session->s_mutex);
  1772. session = NULL;
  1773. }
  1774. if (!session) {
  1775. session = cap->session;
  1776. if (mutex_trylock(&session->s_mutex) == 0) {
  1777. dout("inverting session/ino locks on %p\n",
  1778. session);
  1779. spin_unlock(&ci->i_ceph_lock);
  1780. if (took_snap_rwsem) {
  1781. up_read(&mdsc->snap_rwsem);
  1782. took_snap_rwsem = 0;
  1783. }
  1784. mutex_lock(&session->s_mutex);
  1785. goto retry;
  1786. }
  1787. }
  1788. /* kick flushing and flush snaps before sending normal
  1789. * cap message */
  1790. if (cap == ci->i_auth_cap &&
  1791. (ci->i_ceph_flags &
  1792. (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) {
  1793. if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
  1794. __kick_flushing_caps(mdsc, session, ci, 0);
  1795. ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
  1796. }
  1797. if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
  1798. __ceph_flush_snaps(ci, session);
  1799. goto retry_locked;
  1800. }
  1801. /* take snap_rwsem after session mutex */
  1802. if (!took_snap_rwsem) {
  1803. if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
  1804. dout("inverting snap/in locks on %p\n",
  1805. inode);
  1806. spin_unlock(&ci->i_ceph_lock);
  1807. down_read(&mdsc->snap_rwsem);
  1808. took_snap_rwsem = 1;
  1809. goto retry;
  1810. }
  1811. took_snap_rwsem = 1;
  1812. }
  1813. if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
  1814. flushing = __mark_caps_flushing(inode, session, false,
  1815. &flush_tid,
  1816. &oldest_flush_tid);
  1817. } else {
  1818. flushing = 0;
  1819. flush_tid = 0;
  1820. spin_lock(&mdsc->cap_dirty_lock);
  1821. oldest_flush_tid = __get_oldest_flush_tid(mdsc);
  1822. spin_unlock(&mdsc->cap_dirty_lock);
  1823. }
  1824. mds = cap->mds; /* remember mds, so we don't repeat */
  1825. sent++;
  1826. /* __send_cap drops i_ceph_lock */
  1827. delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, false,
  1828. cap_used, want, retain, flushing,
  1829. flush_tid, oldest_flush_tid);
  1830. goto retry; /* retake i_ceph_lock and restart our cap scan. */
  1831. }
  1832. /* Reschedule delayed caps release if we delayed anything */
  1833. if (delayed)
  1834. __cap_delay_requeue(mdsc, ci);
  1835. spin_unlock(&ci->i_ceph_lock);
  1836. if (queue_invalidate)
  1837. ceph_queue_invalidate(inode);
  1838. if (session)
  1839. mutex_unlock(&session->s_mutex);
  1840. if (took_snap_rwsem)
  1841. up_read(&mdsc->snap_rwsem);
  1842. }
  1843. /*
  1844. * Try to flush dirty caps back to the auth mds.
  1845. */
  1846. static int try_flush_caps(struct inode *inode, u64 *ptid)
  1847. {
  1848. struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
  1849. struct ceph_inode_info *ci = ceph_inode(inode);
  1850. struct ceph_mds_session *session = NULL;
  1851. int flushing = 0;
  1852. u64 flush_tid = 0, oldest_flush_tid = 0;
  1853. retry:
  1854. spin_lock(&ci->i_ceph_lock);
  1855. if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
  1856. spin_unlock(&ci->i_ceph_lock);
  1857. dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
  1858. goto out;
  1859. }
  1860. if (ci->i_dirty_caps && ci->i_auth_cap) {
  1861. struct ceph_cap *cap = ci->i_auth_cap;
  1862. int used = __ceph_caps_used(ci);
  1863. int want = __ceph_caps_wanted(ci);
  1864. int delayed;
  1865. if (!session || session != cap->session) {
  1866. spin_unlock(&ci->i_ceph_lock);
  1867. if (session)
  1868. mutex_unlock(&session->s_mutex);
  1869. session = cap->session;
  1870. mutex_lock(&session->s_mutex);
  1871. goto retry;
  1872. }
  1873. if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) {
  1874. spin_unlock(&ci->i_ceph_lock);
  1875. goto out;
  1876. }
  1877. flushing = __mark_caps_flushing(inode, session, true,
  1878. &flush_tid, &oldest_flush_tid);
  1879. /* __send_cap drops i_ceph_lock */
  1880. delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, true,
  1881. used, want, (cap->issued | cap->implemented),
  1882. flushing, flush_tid, oldest_flush_tid);
  1883. if (delayed) {
  1884. spin_lock(&ci->i_ceph_lock);
  1885. __cap_delay_requeue(mdsc, ci);
  1886. spin_unlock(&ci->i_ceph_lock);
  1887. }
  1888. } else {
  1889. if (!list_empty(&ci->i_cap_flush_list)) {
  1890. struct ceph_cap_flush *cf =
  1891. list_last_entry(&ci->i_cap_flush_list,
  1892. struct ceph_cap_flush, i_list);
  1893. cf->wake = true;
  1894. flush_tid = cf->tid;
  1895. }
  1896. flushing = ci->i_flushing_caps;
  1897. spin_unlock(&ci->i_ceph_lock);
  1898. }
  1899. out:
  1900. if (session)
  1901. mutex_unlock(&session->s_mutex);
  1902. *ptid = flush_tid;
  1903. return flushing;
  1904. }
  1905. /*
  1906. * Return true if we've flushed caps through the given flush_tid.
  1907. */
  1908. static int caps_are_flushed(struct inode *inode, u64 flush_tid)
  1909. {
  1910. struct ceph_inode_info *ci = ceph_inode(inode);
  1911. int ret = 1;
  1912. spin_lock(&ci->i_ceph_lock);
  1913. if (!list_empty(&ci->i_cap_flush_list)) {
struct ceph_cap_flush *cf =
  1915. list_first_entry(&ci->i_cap_flush_list,
  1916. struct ceph_cap_flush, i_list);
  1917. if (cf->tid <= flush_tid)
  1918. ret = 0;
  1919. }
  1920. spin_unlock(&ci->i_ceph_lock);
  1921. return ret;
  1922. }
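/*
* Checking only the first entry is sufficient because i_cap_flush_list
* is appended in tid order: if the oldest pending tid is already past
* flush_tid, everything up to flush_tid has completed.
*/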
  1923. /*
  1924. * wait for any unsafe requests to complete.
  1925. */
  1926. static int unsafe_request_wait(struct inode *inode)
  1927. {
  1928. struct ceph_inode_info *ci = ceph_inode(inode);
  1929. struct ceph_mds_request *req1 = NULL, *req2 = NULL;
  1930. int ret, err = 0;
  1931. spin_lock(&ci->i_unsafe_lock);
  1932. if (S_ISDIR(inode->i_mode) && !list_empty(&ci->i_unsafe_dirops)) {
  1933. req1 = list_last_entry(&ci->i_unsafe_dirops,
  1934. struct ceph_mds_request,
  1935. r_unsafe_dir_item);
  1936. ceph_mdsc_get_request(req1);
  1937. }
  1938. if (!list_empty(&ci->i_unsafe_iops)) {
  1939. req2 = list_last_entry(&ci->i_unsafe_iops,
  1940. struct ceph_mds_request,
  1941. r_unsafe_target_item);
  1942. ceph_mdsc_get_request(req2);
  1943. }
  1944. spin_unlock(&ci->i_unsafe_lock);
  1945. dout("unsafe_request_wait %p wait on tid %llu %llu\n",
  1946. inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
  1947. if (req1) {
  1948. ret = !wait_for_completion_timeout(&req1->r_safe_completion,
  1949. ceph_timeout_jiffies(req1->r_timeout));
  1950. if (ret)
  1951. err = -EIO;
  1952. ceph_mdsc_put_request(req1);
  1953. }
  1954. if (req2) {
  1955. ret = !wait_for_completion_timeout(&req2->r_safe_completion,
  1956. ceph_timeout_jiffies(req2->r_timeout));
  1957. if (ret)
  1958. err = -EIO;
  1959. ceph_mdsc_put_request(req2);
  1960. }
  1961. return err;
  1962. }
  1963. int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
  1964. {
  1965. struct inode *inode = file->f_mapping->host;
  1966. struct ceph_inode_info *ci = ceph_inode(inode);
  1967. u64 flush_tid;
  1968. int ret;
  1969. int dirty;
  1970. dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
  1971. ret = file_write_and_wait_range(file, start, end);
  1972. if (ret < 0)
  1973. goto out;
  1974. if (datasync)
  1975. goto out;
  1976. inode_lock(inode);
  1977. dirty = try_flush_caps(inode, &flush_tid);
  1978. dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
  1979. ret = unsafe_request_wait(inode);
  1980. /*
  1981. * only wait on non-file metadata writeback (the mds
  1982. * can recover size and mtime, so we don't need to
  1983. * wait for that)
  1984. */
  1985. if (!ret && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
  1986. ret = wait_event_interruptible(ci->i_cap_wq,
  1987. caps_are_flushed(inode, flush_tid));
  1988. }
  1989. inode_unlock(inode);
  1990. out:
  1991. dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
  1992. return ret;
  1993. }
  1994. /*
  1995. * Flush any dirty caps back to the mds. If we aren't asked to wait,
  1996. * queue inode for flush but don't do so immediately, because we can
  1997. * get by with fewer MDS messages if we wait for data writeback to
  1998. * complete first.
  1999. */
  2000. int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
  2001. {
  2002. struct ceph_inode_info *ci = ceph_inode(inode);
  2003. u64 flush_tid;
  2004. int err = 0;
  2005. int dirty;
  2006. int wait = (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync);
  2007. dout("write_inode %p wait=%d\n", inode, wait);
  2008. if (wait) {
  2009. dirty = try_flush_caps(inode, &flush_tid);
  2010. if (dirty)
  2011. err = wait_event_interruptible(ci->i_cap_wq,
  2012. caps_are_flushed(inode, flush_tid));
  2013. } else {
  2014. struct ceph_mds_client *mdsc =
  2015. ceph_sb_to_client(inode->i_sb)->mdsc;
  2016. spin_lock(&ci->i_ceph_lock);
  2017. if (__ceph_caps_dirty(ci))
  2018. __cap_delay_requeue_front(mdsc, ci);
  2019. spin_unlock(&ci->i_ceph_lock);
  2020. }
  2021. return err;
  2022. }
  2023. static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
  2024. struct ceph_mds_session *session,
  2025. struct ceph_inode_info *ci,
  2026. u64 oldest_flush_tid)
  2027. __releases(ci->i_ceph_lock)
  2028. __acquires(ci->i_ceph_lock)
  2029. {
  2030. struct inode *inode = &ci->vfs_inode;
  2031. struct ceph_cap *cap;
  2032. struct ceph_cap_flush *cf;
  2033. int ret;
  2034. u64 first_tid = 0;
  2035. list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
  2036. if (cf->tid < first_tid)
  2037. continue;
  2038. cap = ci->i_auth_cap;
  2039. if (!(cap && cap->session == session)) {
  2040. pr_err("%p auth cap %p not mds%d ???\n",
  2041. inode, cap, session->s_mds);
  2042. break;
  2043. }
  2044. first_tid = cf->tid + 1;
  2045. if (cf->caps) {
  2046. dout("kick_flushing_caps %p cap %p tid %llu %s\n",
  2047. inode, cap, cf->tid, ceph_cap_string(cf->caps));
  2048. ci->i_ceph_flags |= CEPH_I_NODELAY;
  2049. ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
  2050. false, __ceph_caps_used(ci),
  2051. __ceph_caps_wanted(ci),
  2052. cap->issued | cap->implemented,
  2053. cf->caps, cf->tid, oldest_flush_tid);
  2054. if (ret) {
  2055. pr_err("kick_flushing_caps: error sending "
  2056. "cap flush, ino (%llx.%llx) "
  2057. "tid %llu flushing %s\n",
  2058. ceph_vinop(inode), cf->tid,
  2059. ceph_cap_string(cf->caps));
  2060. }
  2061. } else {
  2062. struct ceph_cap_snap *capsnap =
  2063. container_of(cf, struct ceph_cap_snap,
  2064. cap_flush);
  2065. dout("kick_flushing_caps %p capsnap %p tid %llu %s\n",
  2066. inode, capsnap, cf->tid,
  2067. ceph_cap_string(capsnap->dirty));
  2068. refcount_inc(&capsnap->nref);
  2069. spin_unlock(&ci->i_ceph_lock);
  2070. ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
  2071. oldest_flush_tid);
  2072. if (ret < 0) {
  2073. pr_err("kick_flushing_caps: error sending "
  2074. "cap flushsnap, ino (%llx.%llx) "
  2075. "tid %llu follows %llu\n",
  2076. ceph_vinop(inode), cf->tid,
  2077. capsnap->follows);
  2078. }
  2079. ceph_put_cap_snap(capsnap);
  2080. }
  2081. spin_lock(&ci->i_ceph_lock);
  2082. }
  2083. }
  2084. void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
  2085. struct ceph_mds_session *session)
  2086. {
  2087. struct ceph_inode_info *ci;
  2088. struct ceph_cap *cap;
  2089. u64 oldest_flush_tid;
  2090. dout("early_kick_flushing_caps mds%d\n", session->s_mds);
  2091. spin_lock(&mdsc->cap_dirty_lock);
  2092. oldest_flush_tid = __get_oldest_flush_tid(mdsc);
  2093. spin_unlock(&mdsc->cap_dirty_lock);
  2094. list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
  2095. spin_lock(&ci->i_ceph_lock);
  2096. cap = ci->i_auth_cap;
  2097. if (!(cap && cap->session == session)) {
  2098. pr_err("%p auth cap %p not mds%d ???\n",
  2099. &ci->vfs_inode, cap, session->s_mds);
  2100. spin_unlock(&ci->i_ceph_lock);
  2101. continue;
  2102. }
/*
* if flushing caps were revoked, we re-send the cap flush
* in the client reconnect stage. This guarantees the MDS processes
* the cap flush message before issuing the flushing caps to
* other clients.
*/
  2109. if ((cap->issued & ci->i_flushing_caps) !=
  2110. ci->i_flushing_caps) {
  2111. ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
  2112. __kick_flushing_caps(mdsc, session, ci,
  2113. oldest_flush_tid);
  2114. } else {
  2115. ci->i_ceph_flags |= CEPH_I_KICK_FLUSH;
  2116. }
  2117. spin_unlock(&ci->i_ceph_lock);
  2118. }
  2119. }
  2120. void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
  2121. struct ceph_mds_session *session)
  2122. {
  2123. struct ceph_inode_info *ci;
  2124. struct ceph_cap *cap;
  2125. u64 oldest_flush_tid;
  2126. dout("kick_flushing_caps mds%d\n", session->s_mds);
  2127. spin_lock(&mdsc->cap_dirty_lock);
  2128. oldest_flush_tid = __get_oldest_flush_tid(mdsc);
  2129. spin_unlock(&mdsc->cap_dirty_lock);
  2130. list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
  2131. spin_lock(&ci->i_ceph_lock);
  2132. cap = ci->i_auth_cap;
  2133. if (!(cap && cap->session == session)) {
  2134. pr_err("%p auth cap %p not mds%d ???\n",
  2135. &ci->vfs_inode, cap, session->s_mds);
  2136. spin_unlock(&ci->i_ceph_lock);
  2137. continue;
  2138. }
  2139. if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
  2140. ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
  2141. __kick_flushing_caps(mdsc, session, ci,
  2142. oldest_flush_tid);
  2143. }
  2144. spin_unlock(&ci->i_ceph_lock);
  2145. }
  2146. }
  2147. static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
  2148. struct ceph_mds_session *session,
  2149. struct inode *inode)
  2150. __releases(ci->i_ceph_lock)
  2151. {
  2152. struct ceph_inode_info *ci = ceph_inode(inode);
  2153. struct ceph_cap *cap;
  2154. cap = ci->i_auth_cap;
  2155. dout("kick_flushing_inode_caps %p flushing %s\n", inode,
  2156. ceph_cap_string(ci->i_flushing_caps));
  2157. if (!list_empty(&ci->i_cap_flush_list)) {
  2158. u64 oldest_flush_tid;
  2159. spin_lock(&mdsc->cap_dirty_lock);
  2160. list_move_tail(&ci->i_flushing_item,
  2161. &cap->session->s_cap_flushing);
  2162. oldest_flush_tid = __get_oldest_flush_tid(mdsc);
  2163. spin_unlock(&mdsc->cap_dirty_lock);
  2164. ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
  2165. __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid);
  2166. spin_unlock(&ci->i_ceph_lock);
  2167. } else {
  2168. spin_unlock(&ci->i_ceph_lock);
  2169. }
  2170. }
  2171. /*
  2172. * Take references to capabilities we hold, so that we don't release
  2173. * them to the MDS prematurely.
  2174. *
  2175. * Protected by i_ceph_lock.
  2176. */
  2177. static void __take_cap_refs(struct ceph_inode_info *ci, int got,
  2178. bool snap_rwsem_locked)
  2179. {
  2180. if (got & CEPH_CAP_PIN)
  2181. ci->i_pin_ref++;
  2182. if (got & CEPH_CAP_FILE_RD)
  2183. ci->i_rd_ref++;
  2184. if (got & CEPH_CAP_FILE_CACHE)
  2185. ci->i_rdcache_ref++;
  2186. if (got & CEPH_CAP_FILE_WR) {
  2187. if (ci->i_wr_ref == 0 && !ci->i_head_snapc) {
  2188. BUG_ON(!snap_rwsem_locked);
  2189. ci->i_head_snapc = ceph_get_snap_context(
  2190. ci->i_snap_realm->cached_context);
  2191. }
  2192. ci->i_wr_ref++;
  2193. }
  2194. if (got & CEPH_CAP_FILE_BUFFER) {
  2195. if (ci->i_wb_ref == 0)
  2196. ihold(&ci->vfs_inode);
  2197. ci->i_wb_ref++;
  2198. dout("__take_cap_refs %p wb %d -> %d (?)\n",
  2199. &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref);
  2200. }
  2201. }
  2202. /*
  2203. * Try to grab cap references. Specify those refs we @want, and the
  2204. * minimal set we @need. Also include the larger offset we are writing
  2205. * to (when applicable), and check against max_size here as well.
  2206. * Note that caller is responsible for ensuring max_size increases are
  2207. * requested from the MDS.
  2208. */
  2209. static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
  2210. loff_t endoff, bool nonblock, int *got, int *err)
  2211. {
  2212. struct inode *inode = &ci->vfs_inode;
  2213. struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
  2214. int ret = 0;
  2215. int have, implemented;
  2216. int file_wanted;
  2217. bool snap_rwsem_locked = false;
  2218. dout("get_cap_refs %p need %s want %s\n", inode,
  2219. ceph_cap_string(need), ceph_cap_string(want));
  2220. again:
  2221. spin_lock(&ci->i_ceph_lock);
  2222. /* make sure file is actually open */
  2223. file_wanted = __ceph_caps_file_wanted(ci);
  2224. if ((file_wanted & need) != need) {
  2225. dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
  2226. ceph_cap_string(need), ceph_cap_string(file_wanted));
  2227. *err = -EBADF;
  2228. ret = 1;
  2229. goto out_unlock;
  2230. }
  2231. /* finish pending truncate */
  2232. while (ci->i_truncate_pending) {
  2233. spin_unlock(&ci->i_ceph_lock);
  2234. if (snap_rwsem_locked) {
  2235. up_read(&mdsc->snap_rwsem);
  2236. snap_rwsem_locked = false;
  2237. }
  2238. __ceph_do_pending_vmtruncate(inode);
  2239. spin_lock(&ci->i_ceph_lock);
  2240. }
  2241. have = __ceph_caps_issued(ci, &implemented);
  2242. if (have & need & CEPH_CAP_FILE_WR) {
  2243. if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
  2244. dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
  2245. inode, endoff, ci->i_max_size);
  2246. if (endoff > ci->i_requested_max_size) {
  2247. *err = -EAGAIN;
  2248. ret = 1;
  2249. }
  2250. goto out_unlock;
  2251. }
  2252. /*
  2253. * If a sync write is in progress, we must wait, so that we
  2254. * can get a final snapshot value for size+mtime.
  2255. */
  2256. if (__ceph_have_pending_cap_snap(ci)) {
  2257. dout("get_cap_refs %p cap_snap_pending\n", inode);
  2258. goto out_unlock;
  2259. }
  2260. }
  2261. if ((have & need) == need) {
  2262. /*
  2263. * Look at (implemented & ~have & not) so that we keep waiting
  2264. * on transition from wanted -> needed caps. This is needed
  2265. * for WRBUFFER|WR -> WR to avoid a new WR sync write from
  2266. * going before a prior buffered writeback happens.
  2267. */
  2268. int not = want & ~(have & need);
  2269. int revoking = implemented & ~have;
  2270. dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
  2271. inode, ceph_cap_string(have), ceph_cap_string(not),
  2272. ceph_cap_string(revoking));
  2273. if ((revoking & not) == 0) {
  2274. if (!snap_rwsem_locked &&
  2275. !ci->i_head_snapc &&
  2276. (need & CEPH_CAP_FILE_WR)) {
  2277. if (!down_read_trylock(&mdsc->snap_rwsem)) {
  2278. /*
  2279. * we can not call down_read() when
  2280. * task isn't in TASK_RUNNING state
  2281. */
  2282. if (nonblock) {
  2283. *err = -EAGAIN;
  2284. ret = 1;
  2285. goto out_unlock;
  2286. }
  2287. spin_unlock(&ci->i_ceph_lock);
  2288. down_read(&mdsc->snap_rwsem);
  2289. snap_rwsem_locked = true;
  2290. goto again;
  2291. }
  2292. snap_rwsem_locked = true;
  2293. }
  2294. *got = need | (have & want);
  2295. if ((need & CEPH_CAP_FILE_RD) &&
  2296. !(*got & CEPH_CAP_FILE_CACHE))
  2297. ceph_disable_fscache_readpage(ci);
  2298. __take_cap_refs(ci, *got, true);
  2299. ret = 1;
  2300. }
  2301. } else {
  2302. int session_readonly = false;
  2303. if ((need & CEPH_CAP_FILE_WR) && ci->i_auth_cap) {
  2304. struct ceph_mds_session *s = ci->i_auth_cap->session;
  2305. spin_lock(&s->s_cap_lock);
  2306. session_readonly = s->s_readonly;
  2307. spin_unlock(&s->s_cap_lock);
  2308. }
  2309. if (session_readonly) {
  2310. dout("get_cap_refs %p needed %s but mds%d readonly\n",
  2311. inode, ceph_cap_string(need), ci->i_auth_cap->mds);
  2312. *err = -EROFS;
  2313. ret = 1;
  2314. goto out_unlock;
  2315. }
  2316. if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) {
  2317. int mds_wanted;
  2318. if (READ_ONCE(mdsc->fsc->mount_state) ==
  2319. CEPH_MOUNT_SHUTDOWN) {
  2320. dout("get_cap_refs %p forced umount\n", inode);
  2321. *err = -EIO;
  2322. ret = 1;
  2323. goto out_unlock;
  2324. }
  2325. mds_wanted = __ceph_caps_mds_wanted(ci, false);
  2326. if (need & ~(mds_wanted & need)) {
  2327. dout("get_cap_refs %p caps were dropped"
  2328. " (session killed?)\n", inode);
  2329. *err = -ESTALE;
  2330. ret = 1;
  2331. goto out_unlock;
  2332. }
  2333. if (!(file_wanted & ~mds_wanted))
  2334. ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED;
  2335. }
  2336. dout("get_cap_refs %p have %s needed %s\n", inode,
  2337. ceph_cap_string(have), ceph_cap_string(need));
  2338. }
  2339. out_unlock:
  2340. spin_unlock(&ci->i_ceph_lock);
  2341. if (snap_rwsem_locked)
  2342. up_read(&mdsc->snap_rwsem);
  2343. dout("get_cap_refs %p ret %d got %s\n", inode,
  2344. ret, ceph_cap_string(*got));
  2345. return ret;
  2346. }
  2347. /*
  2348. * Check the offset we are writing up to against our current
  2349. * max_size. If necessary, tell the MDS we want to write to
  2350. * a larger offset.
  2351. */
  2352. static void check_max_size(struct inode *inode, loff_t endoff)
  2353. {
  2354. struct ceph_inode_info *ci = ceph_inode(inode);
  2355. int check = 0;
  2356. /* do we need to explicitly request a larger max_size? */
  2357. spin_lock(&ci->i_ceph_lock);
  2358. if (endoff >= ci->i_max_size && endoff > ci->i_wanted_max_size) {
  2359. dout("write %p at large endoff %llu, req max_size\n",
  2360. inode, endoff);
  2361. ci->i_wanted_max_size = endoff;
  2362. }
  2363. /* duplicate ceph_check_caps()'s logic */
  2364. if (ci->i_auth_cap &&
  2365. (ci->i_auth_cap->issued & CEPH_CAP_FILE_WR) &&
  2366. ci->i_wanted_max_size > ci->i_max_size &&
  2367. ci->i_wanted_max_size > ci->i_requested_max_size)
  2368. check = 1;
  2369. spin_unlock(&ci->i_ceph_lock);
  2370. if (check)
  2371. ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
  2372. }
  2373. int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, int *got)
  2374. {
  2375. int ret, err = 0;
  2376. BUG_ON(need & ~CEPH_CAP_FILE_RD);
  2377. BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO));
  2378. ret = ceph_pool_perm_check(ci, need);
  2379. if (ret < 0)
  2380. return ret;
  2381. ret = try_get_cap_refs(ci, need, want, 0, true, got, &err);
  2382. if (ret) {
  2383. if (err == -EAGAIN) {
  2384. ret = 0;
  2385. } else if (err < 0) {
  2386. ret = err;
  2387. }
  2388. }
  2389. return ret;
  2390. }
  2391. /*
  2392. * Wait for caps, and take cap references. If we can't get a WR cap
  2393. * due to a small max_size, make sure we check_max_size (and possibly
  2394. * ask the mds) so we don't get hung up indefinitely.
  2395. */
  2396. int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
  2397. loff_t endoff, int *got, struct page **pinned_page)
  2398. {
  2399. int _got, ret, err = 0;
  2400. ret = ceph_pool_perm_check(ci, need);
  2401. if (ret < 0)
  2402. return ret;
  2403. while (true) {
  2404. if (endoff > 0)
  2405. check_max_size(&ci->vfs_inode, endoff);
  2406. err = 0;
  2407. _got = 0;
  2408. ret = try_get_cap_refs(ci, need, want, endoff,
  2409. false, &_got, &err);
  2410. if (ret) {
  2411. if (err == -EAGAIN)
  2412. continue;
  2413. if (err < 0)
  2414. ret = err;
  2415. } else {
  2416. DEFINE_WAIT_FUNC(wait, woken_wake_function);
  2417. add_wait_queue(&ci->i_cap_wq, &wait);
  2418. while (!try_get_cap_refs(ci, need, want, endoff,
  2419. true, &_got, &err)) {
  2420. if (signal_pending(current)) {
  2421. ret = -ERESTARTSYS;
  2422. break;
  2423. }
  2424. wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
  2425. }
  2426. remove_wait_queue(&ci->i_cap_wq, &wait);
  2427. if (err == -EAGAIN)
  2428. continue;
  2429. if (err < 0)
  2430. ret = err;
  2431. }
  2432. if (ret < 0) {
  2433. if (err == -ESTALE) {
  2434. /* session was killed, try renew caps */
  2435. ret = ceph_renew_caps(&ci->vfs_inode);
  2436. if (ret == 0)
  2437. continue;
  2438. }
  2439. return ret;
  2440. }
  2441. if (ci->i_inline_version != CEPH_INLINE_NONE &&
  2442. (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
  2443. i_size_read(&ci->vfs_inode) > 0) {
  2444. struct page *page =
  2445. find_get_page(ci->vfs_inode.i_mapping, 0);
  2446. if (page) {
  2447. if (PageUptodate(page)) {
  2448. *pinned_page = page;
  2449. break;
  2450. }
  2451. put_page(page);
  2452. }
/*
* drop cap refs first because getattr while
* holding caps refs can cause deadlock.
*/
  2457. ceph_put_cap_refs(ci, _got);
  2458. _got = 0;
  2459. /*
  2460. * getattr request will bring inline data into
  2461. * page cache
  2462. */
  2463. ret = __ceph_do_getattr(&ci->vfs_inode, NULL,
  2464. CEPH_STAT_CAP_INLINE_DATA,
  2465. true);
  2466. if (ret < 0)
  2467. return ret;
  2468. continue;
  2469. }
  2470. break;
  2471. }
  2472. if ((_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE))
  2473. ceph_fscache_revalidate_cookie(ci);
  2474. *got = _got;
  2475. return 0;
  2476. }
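/*
* Caller sketch (illustrative, patterned on the file read path):
*
*   int got = 0;
*   struct page *pinned_page = NULL;
*   int ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE,
*                           -1, &got, &pinned_page);
*   if (ret < 0)
*       return ret;
*   ...do the read...
*   if (pinned_page)
*       put_page(pinned_page);
*   ceph_put_cap_refs(ci, got);
*/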
  2477. /*
  2478. * Take cap refs. Caller must already know we hold at least one ref
  2479. * on the caps in question or we don't know this is safe.
  2480. */
  2481. void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
  2482. {
  2483. spin_lock(&ci->i_ceph_lock);
  2484. __take_cap_refs(ci, caps, false);
  2485. spin_unlock(&ci->i_ceph_lock);
  2486. }
/*
* Drop a cap_snap that is not associated with any snapshot.
* We don't need to send a FLUSHSNAP message for it.
*/
  2491. static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci,
  2492. struct ceph_cap_snap *capsnap)
  2493. {
  2494. if (!capsnap->need_flush &&
  2495. !capsnap->writing && !capsnap->dirty_pages) {
  2496. dout("dropping cap_snap %p follows %llu\n",
  2497. capsnap, capsnap->follows);
  2498. BUG_ON(capsnap->cap_flush.tid > 0);
  2499. ceph_put_snap_context(capsnap->context);
  2500. if (!list_is_last(&capsnap->ci_item, &ci->i_cap_snaps))
  2501. ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
  2502. list_del(&capsnap->ci_item);
  2503. ceph_put_cap_snap(capsnap);
  2504. return 1;
  2505. }
  2506. return 0;
  2507. }
  2508. /*
  2509. * Release cap refs.
  2510. *
  2511. * If we released the last ref on any given cap, call ceph_check_caps
  2512. * to release (or schedule a release).
  2513. *
  2514. * If we are releasing a WR cap (from a sync write), finalize any affected
  2515. * cap_snap, and wake up any waiters.
  2516. */
  2517. void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
  2518. {
  2519. struct inode *inode = &ci->vfs_inode;
  2520. int last = 0, put = 0, flushsnaps = 0, wake = 0;
  2521. spin_lock(&ci->i_ceph_lock);
  2522. if (had & CEPH_CAP_PIN)
  2523. --ci->i_pin_ref;
  2524. if (had & CEPH_CAP_FILE_RD)
  2525. if (--ci->i_rd_ref == 0)
  2526. last++;
  2527. if (had & CEPH_CAP_FILE_CACHE)
  2528. if (--ci->i_rdcache_ref == 0)
  2529. last++;
  2530. if (had & CEPH_CAP_FILE_BUFFER) {
  2531. if (--ci->i_wb_ref == 0) {
  2532. last++;
  2533. put++;
  2534. }
  2535. dout("put_cap_refs %p wb %d -> %d (?)\n",
  2536. inode, ci->i_wb_ref+1, ci->i_wb_ref);
  2537. }
  2538. if (had & CEPH_CAP_FILE_WR)
  2539. if (--ci->i_wr_ref == 0) {
  2540. last++;
  2541. if (__ceph_have_pending_cap_snap(ci)) {
  2542. struct ceph_cap_snap *capsnap =
  2543. list_last_entry(&ci->i_cap_snaps,
  2544. struct ceph_cap_snap,
  2545. ci_item);
  2546. capsnap->writing = 0;
  2547. if (ceph_try_drop_cap_snap(ci, capsnap))
  2548. put++;
  2549. else if (__ceph_finish_cap_snap(ci, capsnap))
  2550. flushsnaps = 1;
  2551. wake = 1;
  2552. }
  2553. if (ci->i_wrbuffer_ref_head == 0 &&
  2554. ci->i_dirty_caps == 0 &&
  2555. ci->i_flushing_caps == 0) {
  2556. BUG_ON(!ci->i_head_snapc);
  2557. ceph_put_snap_context(ci->i_head_snapc);
  2558. ci->i_head_snapc = NULL;
  2559. }
  2560. /* see comment in __ceph_remove_cap() */
  2561. if (!__ceph_is_any_caps(ci) && ci->i_snap_realm)
  2562. drop_inode_snap_realm(ci);
  2563. }
  2564. spin_unlock(&ci->i_ceph_lock);
  2565. dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
  2566. last ? " last" : "", put ? " put" : "");
  2567. if (last && !flushsnaps)
  2568. ceph_check_caps(ci, 0, NULL);
  2569. else if (flushsnaps)
  2570. ceph_flush_snaps(ci, NULL);
  2571. if (wake)
  2572. wake_up_all(&ci->i_cap_wq);
  2573. while (put-- > 0)
  2574. iput(inode);
  2575. }
  2576. /*
  2577. * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
  2578. * context. Adjust per-snap dirty page accounting as appropriate.
  2579. * Once all dirty data for a cap_snap is flushed, flush snapped file
  2580. * metadata back to the MDS. If we dropped the last ref, call
  2581. * ceph_check_caps.
  2582. */
  2583. void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
  2584. struct ceph_snap_context *snapc)
  2585. {
  2586. struct inode *inode = &ci->vfs_inode;
  2587. struct ceph_cap_snap *capsnap = NULL;
  2588. int put = 0;
  2589. bool last = false;
  2590. bool found = false;
  2591. bool flush_snaps = false;
  2592. bool complete_capsnap = false;
  2593. spin_lock(&ci->i_ceph_lock);
  2594. ci->i_wrbuffer_ref -= nr;
  2595. if (ci->i_wrbuffer_ref == 0) {
  2596. last = true;
  2597. put++;
  2598. }
  2599. if (ci->i_head_snapc == snapc) {
  2600. ci->i_wrbuffer_ref_head -= nr;
  2601. if (ci->i_wrbuffer_ref_head == 0 &&
  2602. ci->i_wr_ref == 0 &&
  2603. ci->i_dirty_caps == 0 &&
  2604. ci->i_flushing_caps == 0) {
  2605. BUG_ON(!ci->i_head_snapc);
  2606. ceph_put_snap_context(ci->i_head_snapc);
  2607. ci->i_head_snapc = NULL;
  2608. }
  2609. dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
  2610. inode,
  2611. ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
  2612. ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
  2613. last ? " LAST" : "");
  2614. } else {
  2615. list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
  2616. if (capsnap->context == snapc) {
  2617. found = true;
  2618. break;
  2619. }
  2620. }
  2621. BUG_ON(!found);
  2622. capsnap->dirty_pages -= nr;
  2623. if (capsnap->dirty_pages == 0) {
  2624. complete_capsnap = true;
  2625. if (!capsnap->writing) {
  2626. if (ceph_try_drop_cap_snap(ci, capsnap)) {
  2627. put++;
  2628. } else {
  2629. ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
  2630. flush_snaps = true;
  2631. }
  2632. }
  2633. }
  2634. dout("put_wrbuffer_cap_refs on %p cap_snap %p "
  2635. " snap %lld %d/%d -> %d/%d %s%s\n",
  2636. inode, capsnap, capsnap->context->seq,
  2637. ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
  2638. ci->i_wrbuffer_ref, capsnap->dirty_pages,
  2639. last ? " (wrbuffer last)" : "",
  2640. complete_capsnap ? " (complete capsnap)" : "");
  2641. }
  2642. spin_unlock(&ci->i_ceph_lock);
  2643. if (last) {
  2644. ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
  2645. } else if (flush_snaps) {
  2646. ceph_flush_snaps(ci, NULL);
  2647. }
  2648. if (complete_capsnap)
  2649. wake_up_all(&ci->i_cap_wq);
  2650. while (put-- > 0)
  2651. iput(inode);
  2652. }
  2653. /*
  2654. * Invalidate unlinked inode's aliases, so we can drop the inode ASAP.
  2655. */
  2656. static void invalidate_aliases(struct inode *inode)
  2657. {
  2658. struct dentry *dn, *prev = NULL;
  2659. dout("invalidate_aliases inode %p\n", inode);
  2660. d_prune_aliases(inode);
/*
* For a non-directory inode, d_find_alias() only returns a
* hashed dentry. After calling d_invalidate(), the
* dentry becomes unhashed.
*
* For a directory inode, d_find_alias() can return an
* unhashed dentry. But a directory inode should have
* at most one alias.
*/
  2670. while ((dn = d_find_alias(inode))) {
  2671. if (dn == prev) {
  2672. dput(dn);
  2673. break;
  2674. }
  2675. d_invalidate(dn);
  2676. if (prev)
  2677. dput(prev);
  2678. prev = dn;
  2679. }
  2680. if (prev)
  2681. dput(prev);
  2682. }
  2683. struct cap_extra_info {
  2684. struct ceph_string *pool_ns;
  2685. /* inline data */
  2686. u64 inline_version;
  2687. void *inline_data;
  2688. u32 inline_len;
  2689. /* dirstat */
  2690. bool dirstat_valid;
  2691. u64 nfiles;
  2692. u64 nsubdirs;
  2693. /* currently issued */
  2694. int issued;
  2695. };
/*
 * Handle a cap GRANT message from the MDS.  (Note that a GRANT may
 * actually be a revocation if it specifies a smaller cap set.)
 *
 * caller holds s_mutex and i_ceph_lock, we drop both.
 */
static void handle_cap_grant(struct inode *inode,
			     struct ceph_mds_session *session,
			     struct ceph_cap *cap,
			     struct ceph_mds_caps *grant,
			     struct ceph_buffer *xattr_buf,
			     struct cap_extra_info *extra_info)
	__releases(ci->i_ceph_lock)
	__releases(session->s_mdsc->snap_rwsem)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	int seq = le32_to_cpu(grant->seq);
	int newcaps = le32_to_cpu(grant->caps);
	int used, wanted, dirty;
	u64 size = le64_to_cpu(grant->size);
	u64 max_size = le64_to_cpu(grant->max_size);
	int check_caps = 0;
	bool wake = false;
	bool writeback = false;
	bool queue_trunc = false;
	bool queue_invalidate = false;
	bool deleted_inode = false;
	bool fill_inline = false;

	dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
	     inode, cap, session->s_mds, seq, ceph_cap_string(newcaps));
	dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
	     inode->i_size);

	/*
	 * auth mds of the inode changed. we received the cap export message,
	 * but still haven't received the cap import message. handle_cap_export
	 * updated the new auth MDS' cap.
	 *
	 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message
	 * that was sent before the cap import message. So don't remove caps.
	 */
	if (ceph_seq_cmp(seq, cap->seq) <= 0) {
		WARN_ON(cap != ci->i_auth_cap);
		WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id));
		seq = cap->seq;
		newcaps |= cap->issued;
	}

	/*
	 * If CACHE is being revoked, and we have no dirty buffers,
	 * try to invalidate (once).  (If there are dirty buffers, we
	 * will invalidate _after_ writeback.)
	 */
	if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */
	    ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
	    (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
	    !(ci->i_wrbuffer_ref || ci->i_wb_ref)) {
		if (try_nonblocking_invalidate(inode)) {
			/* there were locked pages.. invalidate later
			   in a separate thread. */
			if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
				queue_invalidate = true;
				ci->i_rdcache_revoking = ci->i_rdcache_gen;
			}
		}
	}

	/* side effects now are allowed */
	cap->cap_gen = session->s_cap_gen;
	cap->seq = seq;

	__check_cap_issue(ci, cap, newcaps);

	if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
	    (extra_info->issued & CEPH_CAP_AUTH_EXCL) == 0) {
		inode->i_mode = le32_to_cpu(grant->mode);
		inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid));
		inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid));
		dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
		     from_kuid(&init_user_ns, inode->i_uid),
		     from_kgid(&init_user_ns, inode->i_gid));
	}

	if ((newcaps & CEPH_CAP_LINK_SHARED) &&
	    (extra_info->issued & CEPH_CAP_LINK_EXCL) == 0) {
		set_nlink(inode, le32_to_cpu(grant->nlink));
		if (inode->i_nlink == 0 &&
		    (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
			deleted_inode = true;
	}

	if ((extra_info->issued & CEPH_CAP_XATTR_EXCL) == 0 &&
	    grant->xattr_len) {
		int len = le32_to_cpu(grant->xattr_len);
		u64 version = le64_to_cpu(grant->xattr_version);

		if (version > ci->i_xattrs.version) {
			dout(" got new xattrs v%llu on %p len %d\n",
			     version, inode, len);
			if (ci->i_xattrs.blob)
				ceph_buffer_put(ci->i_xattrs.blob);
			ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
			ci->i_xattrs.version = version;
			ceph_forget_all_cached_acls(inode);
		}
	}

	if (newcaps & CEPH_CAP_ANY_RD) {
		struct timespec mtime, atime, ctime;
		/* ctime/mtime/atime? */
		ceph_decode_timespec(&mtime, &grant->mtime);
		ceph_decode_timespec(&atime, &grant->atime);
		ceph_decode_timespec(&ctime, &grant->ctime);
		ceph_fill_file_time(inode, extra_info->issued,
				    le32_to_cpu(grant->time_warp_seq),
				    &ctime, &mtime, &atime);
	}

	if ((newcaps & CEPH_CAP_FILE_SHARED) && extra_info->dirstat_valid) {
		ci->i_files = extra_info->nfiles;
		ci->i_subdirs = extra_info->nsubdirs;
	}

	if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
		/* file layout may have changed */
		s64 old_pool = ci->i_layout.pool_id;
		struct ceph_string *old_ns;

		ceph_file_layout_from_legacy(&ci->i_layout, &grant->layout);
		old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
					lockdep_is_held(&ci->i_ceph_lock));
		rcu_assign_pointer(ci->i_layout.pool_ns, extra_info->pool_ns);

		if (ci->i_layout.pool_id != old_pool ||
		    extra_info->pool_ns != old_ns)
			ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;

		extra_info->pool_ns = old_ns;

		/* size/truncate_seq? */
		queue_trunc = ceph_fill_file_size(inode, extra_info->issued,
					le32_to_cpu(grant->truncate_seq),
					le64_to_cpu(grant->truncate_size),
					size);
	}
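
	/*
	 * max_size is only meaningful on the auth cap: it is the offset up
	 * to which the MDS has pre-authorized us to write.  When a grant
	 * raises it far enough to satisfy what we asked for, clear the
	 * wanted/requested trackers and wake any writers waiting on it.
	 */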
	if (ci->i_auth_cap == cap && (newcaps & CEPH_CAP_ANY_FILE_WR)) {
		if (max_size != ci->i_max_size) {
			dout("max_size %lld -> %llu\n",
			     ci->i_max_size, max_size);
			ci->i_max_size = max_size;
			if (max_size >= ci->i_wanted_max_size) {
				ci->i_wanted_max_size = 0;  /* reset */
				ci->i_requested_max_size = 0;
			}
			wake = true;
		} else if (ci->i_wanted_max_size > ci->i_max_size &&
			   ci->i_wanted_max_size > ci->i_requested_max_size) {
			/* CEPH_CAP_OP_IMPORT */
			wake = true;
		}
	}

	/* check cap bits */
	wanted = __ceph_caps_wanted(ci);
	used = __ceph_caps_used(ci);
	dirty = __ceph_caps_dirty(ci);
	dout(" my wanted = %s, used = %s, dirty %s\n",
	     ceph_cap_string(wanted),
	     ceph_cap_string(used),
	     ceph_cap_string(dirty));

	if (wanted != le32_to_cpu(grant->wanted)) {
		dout("mds wanted %s -> %s\n",
		     ceph_cap_string(le32_to_cpu(grant->wanted)),
		     ceph_cap_string(wanted));
		/* imported cap may not have correct mds_wanted */
		if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT)
			check_caps = 1;
	}
	/* revocation, grant, or no-op? */
	if (cap->issued & ~newcaps) {
		int revoking = cap->issued & ~newcaps;

		dout("revocation: %s -> %s (revoking %s)\n",
		     ceph_cap_string(cap->issued),
		     ceph_cap_string(newcaps),
		     ceph_cap_string(revoking));
		if (revoking & used & CEPH_CAP_FILE_BUFFER)
			writeback = true;  /* initiate writeback; will delay ack */
		else if (revoking == CEPH_CAP_FILE_CACHE &&
			 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
			 queue_invalidate)
			;  /* do nothing yet, invalidation will be queued */
		else if (cap == ci->i_auth_cap)
			check_caps = 1;  /* check auth cap only */
		else
			check_caps = 2;  /* check all caps */
		cap->issued = newcaps;
		cap->implemented |= newcaps;
	} else if (cap->issued == newcaps) {
		dout("caps unchanged: %s -> %s\n",
		     ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
	} else {
		dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
		     ceph_cap_string(newcaps));
		/* is a non-auth MDS revoking the newly granted caps? */
		if (cap == ci->i_auth_cap &&
		    __ceph_caps_revoking_other(ci, cap, newcaps))
			check_caps = 2;

		cap->issued = newcaps;
		cap->implemented |= newcaps; /* add bits only, to
					      * avoid stepping on a
					      * pending revocation */
		wake = true;
	}
	BUG_ON(cap->issued & ~cap->implemented);

	if (extra_info->inline_version > 0 &&
	    extra_info->inline_version >= ci->i_inline_version) {
		ci->i_inline_version = extra_info->inline_version;
		if (ci->i_inline_version != CEPH_INLINE_NONE &&
		    (newcaps & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)))
			fill_inline = true;
	}

	if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
		if (newcaps & ~extra_info->issued)
			wake = true;
		kick_flushing_inode_caps(session->s_mdsc, session, inode);
		up_read(&session->s_mdsc->snap_rwsem);
	} else {
		spin_unlock(&ci->i_ceph_lock);
	}

	if (fill_inline)
		ceph_fill_inline_data(inode, NULL, extra_info->inline_data,
				      extra_info->inline_len);

	if (queue_trunc)
		ceph_queue_vmtruncate(inode);

	if (writeback)
		/*
		 * queue inode for writeback: we can't actually call
		 * filemap_write_and_wait, etc. from message handler
		 * context.
		 */
		ceph_queue_writeback(inode);
	if (queue_invalidate)
		ceph_queue_invalidate(inode);
	if (deleted_inode)
		invalidate_aliases(inode);
	if (wake)
		wake_up_all(&ci->i_cap_wq);
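
	/*
	 * check_caps == 1: re-check the auth cap only; check_caps == 2:
	 * re-check all caps on the inode.  Passing the session lets
	 * ceph_check_caps() release s_mutex on our behalf, which is why
	 * only the fall-through case unlocks it explicitly (this
	 * function's contract is to drop both locks before returning).
	 */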
	if (check_caps == 1)
		ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
				session);
	else if (check_caps == 2)
		ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
	else
		mutex_unlock(&session->s_mutex);
}

/*
 * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
 * MDS has been safely committed.
 */
static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
				 struct ceph_mds_caps *m,
				 struct ceph_mds_session *session,
				 struct ceph_cap *cap)
	__releases(ci->i_ceph_lock)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
	struct ceph_cap_flush *cf, *tmp_cf;
	LIST_HEAD(to_remove);
	unsigned seq = le32_to_cpu(m->seq);
	int dirty = le32_to_cpu(m->dirty);
	int cleaned = 0;
	bool drop = false;
	bool wake_ci = false;
	bool wake_mdsc = false;
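
	/*
	 * Flush acks arrive in tid order, so every cap flush on the list
	 * with a tid at or below the acked tid is now stable and can be
	 * finished.  Entries with cf->caps == 0 are capsnap flushes; they
	 * are skipped here and torn down by handle_cap_flushsnap_ack().
	 */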
	list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) {
		if (cf->tid == flush_tid)
			cleaned = cf->caps;
		if (cf->caps == 0) /* capsnap */
			continue;
		if (cf->tid <= flush_tid) {
			if (__finish_cap_flush(NULL, ci, cf))
				wake_ci = true;
			list_add_tail(&cf->i_list, &to_remove);
		} else {
			cleaned &= ~cf->caps;
			if (!cleaned)
				break;
		}
	}

	dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
	     " flushing %s -> %s\n",
	     inode, session->s_mds, seq, ceph_cap_string(dirty),
	     ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
	     ceph_cap_string(ci->i_flushing_caps & ~cleaned));

	if (list_empty(&to_remove) && !cleaned)
		goto out;

	ci->i_flushing_caps &= ~cleaned;

	spin_lock(&mdsc->cap_dirty_lock);
	list_for_each_entry(cf, &to_remove, i_list) {
		if (__finish_cap_flush(mdsc, NULL, cf))
			wake_mdsc = true;
	}

	if (ci->i_flushing_caps == 0) {
		if (list_empty(&ci->i_cap_flush_list)) {
			list_del_init(&ci->i_flushing_item);
			if (!list_empty(&session->s_cap_flushing)) {
				dout(" mds%d still flushing cap on %p\n",
				     session->s_mds,
				     &list_first_entry(&session->s_cap_flushing,
						struct ceph_inode_info,
						i_flushing_item)->vfs_inode);
			}
		}
		mdsc->num_cap_flushing--;
		dout(" inode %p now !flushing\n", inode);

		if (ci->i_dirty_caps == 0) {
			dout(" inode %p now clean\n", inode);
			BUG_ON(!list_empty(&ci->i_dirty_item));
			drop = true;
			if (ci->i_wr_ref == 0 &&
			    ci->i_wrbuffer_ref_head == 0) {
				BUG_ON(!ci->i_head_snapc);
				ceph_put_snap_context(ci->i_head_snapc);
				ci->i_head_snapc = NULL;
			}
		} else {
			BUG_ON(list_empty(&ci->i_dirty_item));
		}
	}
	spin_unlock(&mdsc->cap_dirty_lock);

out:
	spin_unlock(&ci->i_ceph_lock);

	while (!list_empty(&to_remove)) {
		cf = list_first_entry(&to_remove,
				      struct ceph_cap_flush, i_list);
		list_del(&cf->i_list);
		ceph_free_cap_flush(cf);
	}

	if (wake_ci)
		wake_up_all(&ci->i_cap_wq);
	if (wake_mdsc)
		wake_up_all(&mdsc->cap_flushing_wq);
	if (drop)
		iput(inode);
}

/*
 * Handle FLUSHSNAP_ACK.  MDS has flushed snap data to disk and we can
 * throw away our cap_snap.
 *
 * Caller holds s_mutex.
 */
static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
				     struct ceph_mds_caps *m,
				     struct ceph_mds_session *session)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
	u64 follows = le64_to_cpu(m->snap_follows);
	struct ceph_cap_snap *capsnap;
	bool flushed = false;
	bool wake_ci = false;
	bool wake_mdsc = false;

	dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
	     inode, ci, session->s_mds, follows);

	spin_lock(&ci->i_ceph_lock);
	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
		if (capsnap->follows == follows) {
			if (capsnap->cap_flush.tid != flush_tid) {
				dout(" cap_snap %p follows %lld tid %lld !="
				     " %lld\n", capsnap, follows,
				     flush_tid, capsnap->cap_flush.tid);
				break;
			}
			flushed = true;
			break;
		} else {
			dout(" skipping cap_snap %p follows %lld\n",
			     capsnap, capsnap->follows);
		}
	}
	if (flushed) {
		WARN_ON(capsnap->dirty_pages || capsnap->writing);
		dout(" removing %p cap_snap %p follows %lld\n",
		     inode, capsnap, follows);
		list_del(&capsnap->ci_item);
		if (__finish_cap_flush(NULL, ci, &capsnap->cap_flush))
			wake_ci = true;

		spin_lock(&mdsc->cap_dirty_lock);
		if (list_empty(&ci->i_cap_flush_list))
			list_del_init(&ci->i_flushing_item);
		if (__finish_cap_flush(mdsc, NULL, &capsnap->cap_flush))
			wake_mdsc = true;
		spin_unlock(&mdsc->cap_dirty_lock);
	}
	spin_unlock(&ci->i_ceph_lock);
	if (flushed) {
		ceph_put_snap_context(capsnap->context);
		ceph_put_cap_snap(capsnap);
		if (wake_ci)
			wake_up_all(&ci->i_cap_wq);
		if (wake_mdsc)
			wake_up_all(&mdsc->cap_flushing_wq);
		iput(inode);
	}
}

/*
 * Handle TRUNC from MDS, indicating file truncation.
 *
 * caller holds s_mutex.
 */
static void handle_cap_trunc(struct inode *inode,
			     struct ceph_mds_caps *trunc,
			     struct ceph_mds_session *session)
	__releases(ci->i_ceph_lock)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	int mds = session->s_mds;
	int seq = le32_to_cpu(trunc->seq);
	u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
	u64 truncate_size = le64_to_cpu(trunc->truncate_size);
	u64 size = le64_to_cpu(trunc->size);
	int implemented = 0;
	int dirty = __ceph_caps_dirty(ci);
	int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
	int queue_trunc = 0;
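
	/*
	 * Fold implemented and dirty bits into 'issued' so that
	 * ceph_fill_file_size() treats any size state we are still using
	 * or flushing as locally authoritative before applying the MDS's
	 * truncation.
	 */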
	issued |= implemented | dirty;

	dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
	     inode, mds, seq, truncate_size, truncate_seq);
	queue_trunc = ceph_fill_file_size(inode, issued,
					  truncate_seq, truncate_size, size);
	spin_unlock(&ci->i_ceph_lock);

	if (queue_trunc)
		ceph_queue_vmtruncate(inode);
}

/*
 * Handle EXPORT from MDS.  Cap is being migrated _from_ this mds to a
 * different one.  If this is the most recent migration we've seen (as
 * indicated by mseq), make note of the migrating cap bits for the
 * duration (until we see the corresponding IMPORT).
 *
 * caller holds s_mutex
 */
static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
			      struct ceph_mds_cap_peer *ph,
			      struct ceph_mds_session *session)
{
	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
	struct ceph_mds_session *tsession = NULL;
	struct ceph_cap *cap, *tcap, *new_cap = NULL;
	struct ceph_inode_info *ci = ceph_inode(inode);
	u64 t_cap_id;
	unsigned mseq = le32_to_cpu(ex->migrate_seq);
	unsigned t_seq, t_mseq;
	int target, issued;
	int mds = session->s_mds;

	if (ph) {
		t_cap_id = le64_to_cpu(ph->cap_id);
		t_seq = le32_to_cpu(ph->seq);
		t_mseq = le32_to_cpu(ph->mseq);
		target = le32_to_cpu(ph->mds);
	} else {
		t_cap_id = t_seq = t_mseq = 0;
		target = -1;
	}

	dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n",
	     inode, ci, mds, mseq, target);
retry:
	spin_lock(&ci->i_ceph_lock);
	cap = __get_cap_for_mds(ci, mds);
	if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id))
		goto out_unlock;

	if (target < 0) {
		__ceph_remove_cap(cap, false);
		if (!ci->i_auth_cap)
			ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
		goto out_unlock;
	}

	/*
	 * now we know we haven't received the cap import message yet
	 * because the exported cap still exists.
	 */
	issued = cap->issued;
	if (issued != cap->implemented)
		pr_err_ratelimited("handle_cap_export: issued != implemented: "
				"ino (%llx.%llx) mds%d seq %d mseq %d "
				"issued %s implemented %s\n",
				ceph_vinop(inode), mds, cap->seq, cap->mseq,
				ceph_cap_string(issued),
				ceph_cap_string(cap->implemented));

	tcap = __get_cap_for_mds(ci, target);
	if (tcap) {
		/* already have caps from the target */
		if (tcap->cap_id == t_cap_id &&
		    ceph_seq_cmp(tcap->seq, t_seq) < 0) {
			dout(" updating import cap %p mds%d\n", tcap, target);
			tcap->cap_id = t_cap_id;
			tcap->seq = t_seq - 1;
			tcap->issue_seq = t_seq - 1;
			tcap->mseq = t_mseq;
			tcap->issued |= issued;
			tcap->implemented |= issued;
			if (cap == ci->i_auth_cap)
				ci->i_auth_cap = tcap;

			if (!list_empty(&ci->i_cap_flush_list) &&
			    ci->i_auth_cap == tcap) {
				spin_lock(&mdsc->cap_dirty_lock);
				list_move_tail(&ci->i_flushing_item,
					       &tcap->session->s_cap_flushing);
				spin_unlock(&mdsc->cap_dirty_lock);
			}
		}
		__ceph_remove_cap(cap, false);
		goto out_unlock;
	} else if (tsession) {
		/* add placeholder for the export target */
		int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
		tcap = new_cap;
		ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
			     t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);

		if (!list_empty(&ci->i_cap_flush_list) &&
		    ci->i_auth_cap == tcap) {
			spin_lock(&mdsc->cap_dirty_lock);
			list_move_tail(&ci->i_flushing_item,
				       &tcap->session->s_cap_flushing);
			spin_unlock(&mdsc->cap_dirty_lock);
		}

		__ceph_remove_cap(cap, false);
		goto out_unlock;
	}

	spin_unlock(&ci->i_ceph_lock);
	mutex_unlock(&session->s_mutex);

	/* open target session */
	tsession = ceph_mdsc_open_export_target_session(mdsc, target);
	if (!IS_ERR(tsession)) {
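		/*
		 * Take the two session mutexes in mds rank order so that
		 * racing migrations between the same pair of MDSes cannot
		 * deadlock; mutex_lock_nested() tells lockdep that holding
		 * two locks of the same class here is intentional.
		 */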
		if (mds > target) {
			mutex_lock(&session->s_mutex);
			mutex_lock_nested(&tsession->s_mutex,
					  SINGLE_DEPTH_NESTING);
		} else {
			mutex_lock(&tsession->s_mutex);
			mutex_lock_nested(&session->s_mutex,
					  SINGLE_DEPTH_NESTING);
		}
		new_cap = ceph_get_cap(mdsc, NULL);
	} else {
		WARN_ON(1);
		tsession = NULL;
		target = -1;
	}
	goto retry;

out_unlock:
	spin_unlock(&ci->i_ceph_lock);
	mutex_unlock(&session->s_mutex);
	if (tsession) {
		mutex_unlock(&tsession->s_mutex);
		ceph_put_mds_session(tsession);
	}
	if (new_cap)
		ceph_put_cap(mdsc, new_cap);
}

/*
 * Handle cap IMPORT.
 *
 * caller holds s_mutex. acquires i_ceph_lock
 */
static void handle_cap_import(struct ceph_mds_client *mdsc,
			      struct inode *inode, struct ceph_mds_caps *im,
			      struct ceph_mds_cap_peer *ph,
			      struct ceph_mds_session *session,
			      struct ceph_cap **target_cap, int *old_issued)
	__acquires(ci->i_ceph_lock)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_cap *cap, *ocap, *new_cap = NULL;
	int mds = session->s_mds;
	int issued;
	unsigned caps = le32_to_cpu(im->caps);
	unsigned wanted = le32_to_cpu(im->wanted);
	unsigned seq = le32_to_cpu(im->seq);
	unsigned mseq = le32_to_cpu(im->migrate_seq);
	u64 realmino = le64_to_cpu(im->realm);
	u64 cap_id = le64_to_cpu(im->cap_id);
	u64 p_cap_id;
	int peer;

	if (ph) {
		p_cap_id = le64_to_cpu(ph->cap_id);
		peer = le32_to_cpu(ph->mds);
	} else {
		p_cap_id = 0;
		peer = -1;
	}

	dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n",
	     inode, ci, mds, mseq, peer);
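
	/*
	 * We may need to allocate a cap struct, which can sleep and so
	 * cannot happen under i_ceph_lock: drop the lock, allocate, and
	 * retry the lookup from the top.
	 */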
retry:
	spin_lock(&ci->i_ceph_lock);
	cap = __get_cap_for_mds(ci, mds);
	if (!cap) {
		if (!new_cap) {
			spin_unlock(&ci->i_ceph_lock);
			new_cap = ceph_get_cap(mdsc, NULL);
			goto retry;
		}
		cap = new_cap;
	} else {
		if (new_cap) {
			ceph_put_cap(mdsc, new_cap);
			new_cap = NULL;
		}
	}

	__ceph_caps_issued(ci, &issued);
	issued |= __ceph_caps_dirty(ci);

	ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq,
		     realmino, CEPH_CAP_FLAG_AUTH, &new_cap);

	ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
	if (ocap && ocap->cap_id == p_cap_id) {
		dout(" remove export cap %p mds%d flags %d\n",
		     ocap, peer, ph->flags);
		if ((ph->flags & CEPH_CAP_FLAG_AUTH) &&
		    (ocap->seq != le32_to_cpu(ph->seq) ||
		     ocap->mseq != le32_to_cpu(ph->mseq))) {
			pr_err_ratelimited("handle_cap_import: "
					"mismatched seq/mseq: ino (%llx.%llx) "
					"mds%d seq %d mseq %d importer mds%d "
					"has peer seq %d mseq %d\n",
					ceph_vinop(inode), peer, ocap->seq,
					ocap->mseq, mds, le32_to_cpu(ph->seq),
					le32_to_cpu(ph->mseq));
		}
		__ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
	}

	/* make sure we re-request max_size, if necessary */
	ci->i_requested_max_size = 0;

	*old_issued = issued;
	*target_cap = cap;
}

/*
 * Handle a caps message from the MDS.
 *
 * Identify the appropriate session, inode, and call the right handler
 * based on the cap op.
 */
void ceph_handle_caps(struct ceph_mds_session *session,
		      struct ceph_msg *msg)
{
	struct ceph_mds_client *mdsc = session->s_mdsc;
	struct inode *inode;
	struct ceph_inode_info *ci;
	struct ceph_cap *cap;
	struct ceph_mds_caps *h;
	struct ceph_mds_cap_peer *peer = NULL;
	struct ceph_snap_realm *realm = NULL;
	int op;
	int msg_version = le16_to_cpu(msg->hdr.version);
	u32 seq, mseq;
	struct ceph_vino vino;
	void *snaptrace;
	size_t snaptrace_len;
	void *p, *end;
	struct cap_extra_info extra_info = {};

	dout("handle_caps from mds%d\n", session->s_mds);

	/* decode */
	end = msg->front.iov_base + msg->front.iov_len;
	if (msg->front.iov_len < sizeof(*h))
		goto bad;
	h = msg->front.iov_base;
	op = le32_to_cpu(h->op);
	vino.ino = le64_to_cpu(h->ino);
	vino.snap = CEPH_NOSNAP;
	seq = le32_to_cpu(h->seq);
	mseq = le32_to_cpu(h->migrate_seq);
	snaptrace = h + 1;
	snaptrace_len = le32_to_cpu(h->snap_trace_len);
	p = snaptrace + snaptrace_len;
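
	/*
	 * Fields past the base header were appended one message version
	 * at a time, so they must be decoded strictly in version order;
	 * each block below consumes everything introduced up to the
	 * version it is gated on.
	 */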
	if (msg_version >= 2) {
		u32 flock_len;
		ceph_decode_32_safe(&p, end, flock_len, bad);
		if (p + flock_len > end)
			goto bad;
		p += flock_len;
	}

	if (msg_version >= 3) {
		if (op == CEPH_CAP_OP_IMPORT) {
			if (p + sizeof(*peer) > end)
				goto bad;
			peer = p;
			p += sizeof(*peer);
		} else if (op == CEPH_CAP_OP_EXPORT) {
			/* recorded in unused fields */
			peer = (void *)&h->size;
		}
	}

	if (msg_version >= 4) {
		ceph_decode_64_safe(&p, end, extra_info.inline_version, bad);
		ceph_decode_32_safe(&p, end, extra_info.inline_len, bad);
		if (p + extra_info.inline_len > end)
			goto bad;
		extra_info.inline_data = p;
		p += extra_info.inline_len;
	}

	if (msg_version >= 5) {
		struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
		u32 epoch_barrier;

		ceph_decode_32_safe(&p, end, epoch_barrier, bad);
		ceph_osdc_update_epoch_barrier(osdc, epoch_barrier);
	}

	if (msg_version >= 8) {
		u64 flush_tid;
		u32 caller_uid, caller_gid;
		u32 pool_ns_len;

		/* version >= 6 */
		ceph_decode_64_safe(&p, end, flush_tid, bad);
		/* version >= 7 */
		ceph_decode_32_safe(&p, end, caller_uid, bad);
		ceph_decode_32_safe(&p, end, caller_gid, bad);
		/* version >= 8 */
		ceph_decode_32_safe(&p, end, pool_ns_len, bad);
		if (pool_ns_len > 0) {
			ceph_decode_need(&p, end, pool_ns_len, bad);
			extra_info.pool_ns =
				ceph_find_or_create_string(p, pool_ns_len);
			p += pool_ns_len;
		}
	}

	if (msg_version >= 11) {
		struct ceph_timespec *btime;
		u64 change_attr;
		u32 flags;

		/* version >= 9 */
		if (p + sizeof(*btime) > end)
			goto bad;
		btime = p;
		p += sizeof(*btime);
		ceph_decode_64_safe(&p, end, change_attr, bad);
		/* version >= 10 */
		ceph_decode_32_safe(&p, end, flags, bad);
		/* version >= 11 */
		extra_info.dirstat_valid = true;
		ceph_decode_64_safe(&p, end, extra_info.nfiles, bad);
		ceph_decode_64_safe(&p, end, extra_info.nsubdirs, bad);
	}
	/* lookup ino */
	inode = ceph_find_inode(mdsc->fsc->sb, vino);
	ci = ceph_inode(inode);
	dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
	     vino.snap, inode);

	mutex_lock(&session->s_mutex);
	session->s_seq++;
	dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
	     (unsigned)seq);

	if (!inode) {
		dout(" i don't have ino %llx\n", vino.ino);

		if (op == CEPH_CAP_OP_IMPORT) {
			cap = ceph_get_cap(mdsc, NULL);
			cap->cap_ino = vino.ino;
			cap->queue_release = 1;
			cap->cap_id = le64_to_cpu(h->cap_id);
			cap->mseq = mseq;
			cap->seq = seq;
			cap->issue_seq = seq;
			spin_lock(&session->s_cap_lock);
			list_add_tail(&cap->session_caps,
				      &session->s_cap_releases);
			session->s_num_cap_releases++;
			spin_unlock(&session->s_cap_lock);
		}
		goto flush_cap_releases;
	}

	/* these will work even if we don't have a cap yet */
	switch (op) {
	case CEPH_CAP_OP_FLUSHSNAP_ACK:
		handle_cap_flushsnap_ack(inode, le64_to_cpu(msg->hdr.tid),
					 h, session);
		goto done;

	case CEPH_CAP_OP_EXPORT:
		handle_cap_export(inode, h, peer, session);
		goto done_unlocked;

	case CEPH_CAP_OP_IMPORT:
		realm = NULL;
		if (snaptrace_len) {
			down_write(&mdsc->snap_rwsem);
			ceph_update_snap_trace(mdsc, snaptrace,
					       snaptrace + snaptrace_len,
					       false, &realm);
			downgrade_write(&mdsc->snap_rwsem);
		} else {
			down_read(&mdsc->snap_rwsem);
		}
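		/*
		 * snap_rwsem is now held for read either way.
		 * handle_cap_import() returns with i_ceph_lock held, and
		 * handle_cap_grant() then drops both i_ceph_lock and the
		 * read side of snap_rwsem (see its lock annotations).
		 */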
		handle_cap_import(mdsc, inode, h, peer, session,
				  &cap, &extra_info.issued);
		handle_cap_grant(inode, session, cap,
				 h, msg->middle, &extra_info);
		if (realm)
			ceph_put_snap_realm(mdsc, realm);
		goto done_unlocked;
	}

	/* the rest require a cap */
	spin_lock(&ci->i_ceph_lock);
	cap = __get_cap_for_mds(ceph_inode(inode), session->s_mds);
	if (!cap) {
		dout(" no cap on %p ino %llx.%llx from mds%d\n",
		     inode, ceph_ino(inode), ceph_snap(inode),
		     session->s_mds);
		spin_unlock(&ci->i_ceph_lock);
		goto flush_cap_releases;
	}

	/* note that each of these drops i_ceph_lock for us */
	switch (op) {
	case CEPH_CAP_OP_REVOKE:
	case CEPH_CAP_OP_GRANT:
		__ceph_caps_issued(ci, &extra_info.issued);
		extra_info.issued |= __ceph_caps_dirty(ci);
		handle_cap_grant(inode, session, cap,
				 h, msg->middle, &extra_info);
		goto done_unlocked;

	case CEPH_CAP_OP_FLUSH_ACK:
		handle_cap_flush_ack(inode, le64_to_cpu(msg->hdr.tid),
				     h, session, cap);
		break;

	case CEPH_CAP_OP_TRUNC:
		handle_cap_trunc(inode, h, session);
		break;

	default:
		spin_unlock(&ci->i_ceph_lock);
		pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
		       ceph_cap_op_name(op));
	}

	goto done;

flush_cap_releases:
	/*
	 * send any cap release message to try to move things
	 * along for the mds (who clearly thinks we still have this
	 * cap).
	 */
	ceph_send_cap_releases(mdsc, session);

done:
	mutex_unlock(&session->s_mutex);
done_unlocked:
	iput(inode);
	ceph_put_string(extra_info.pool_ns);
	return;

bad:
	pr_err("ceph_handle_caps: corrupt message\n");
	ceph_msg_dump(msg);
	return;
}

/*
 * Delayed work handler to process end of delayed cap release LRU list.
 */
void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
{
	struct inode *inode;
	struct ceph_inode_info *ci;
	int flags = CHECK_CAPS_NODELAY;

	dout("check_delayed_caps\n");
	while (1) {
		spin_lock(&mdsc->cap_delay_lock);
		if (list_empty(&mdsc->cap_delay_list))
			break;
		ci = list_first_entry(&mdsc->cap_delay_list,
				      struct ceph_inode_info,
				      i_cap_delay_list);
		if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
		    time_before(jiffies, ci->i_hold_caps_max))
			break;
		list_del_init(&ci->i_cap_delay_list);

		inode = igrab(&ci->vfs_inode);
		spin_unlock(&mdsc->cap_delay_lock);

		if (inode) {
			dout("check_delayed_caps on %p\n", inode);
			ceph_check_caps(ci, flags, NULL);
			iput(inode);
		}
	}
	spin_unlock(&mdsc->cap_delay_lock);
}

/*
 * Flush all dirty caps to the mds
 */
void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
{
	struct ceph_inode_info *ci;
	struct inode *inode;

	dout("flush_dirty_caps\n");
	spin_lock(&mdsc->cap_dirty_lock);
	while (!list_empty(&mdsc->cap_dirty)) {
		ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info,
				      i_dirty_item);
		inode = &ci->vfs_inode;
		ihold(inode);
		dout("flush_dirty_caps %p\n", inode);
		spin_unlock(&mdsc->cap_dirty_lock);
		ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL);
		iput(inode);
		spin_lock(&mdsc->cap_dirty_lock);
	}
	spin_unlock(&mdsc->cap_dirty_lock);
	dout("flush_dirty_caps done\n");
}
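
/*
 * Track open-file references by mode.  Slot 0 of i_nr_by_mode counts
 * every open (an implicit PIN reference); slot i counts opens whose
 * fmode has bit (i - 1) set, hence bits = (fmode << 1) | 1 below.
 */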
void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
{
	int i;
	int bits = (fmode << 1) | 1;
	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
		if (bits & (1 << i))
			ci->i_nr_by_mode[i]++;
	}
}

/*
 * Drop open file reference.  If we were the last open file,
 * we may need to release capabilities to the MDS (or schedule
 * their delayed release).
 */
void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
{
	int i, last = 0;
	int bits = (fmode << 1) | 1;
	spin_lock(&ci->i_ceph_lock);
	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
		if (bits & (1 << i)) {
			BUG_ON(ci->i_nr_by_mode[i] == 0);
			if (--ci->i_nr_by_mode[i] == 0)
				last++;
		}
	}
	dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n",
	     &ci->vfs_inode, fmode,
	     ci->i_nr_by_mode[0], ci->i_nr_by_mode[1],
	     ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]);
	spin_unlock(&ci->i_ceph_lock);

	if (last && ci->i_vino.snap == CEPH_NOSNAP)
		ceph_check_caps(ci, 0, NULL);
}

/*
 * For a soon-to-be unlinked file, drop the LINK caps.  If it looks
 * like the link count will hit 0, drop any other caps (other than
 * PIN) we don't specifically want (due to the file still being
 * open).
 */
int ceph_drop_caps_for_unlink(struct inode *inode)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

	spin_lock(&ci->i_ceph_lock);
	if (inode->i_nlink == 1) {
		drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);

		ci->i_ceph_flags |= CEPH_I_NODELAY;
		if (__ceph_caps_dirty(ci)) {
			struct ceph_mds_client *mdsc =
				ceph_inode_to_client(inode)->mdsc;
			__cap_delay_requeue_front(mdsc, ci);
		}
	}
	spin_unlock(&ci->i_ceph_lock);
	return drop;
}

/*
 * Helpers for embedding cap and dentry lease releases into mds
 * requests.
 *
 * @force is used by dentry_release (below) to force inclusion of a
 * record for the directory inode, even when there aren't any caps to
 * drop.
 */
int ceph_encode_inode_release(void **p, struct inode *inode,
			      int mds, int drop, int unless, int force)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_cap *cap;
	struct ceph_mds_request_release *rel = *p;
	int used, dirty;
	int ret = 0;

	spin_lock(&ci->i_ceph_lock);
	used = __ceph_caps_used(ci);
	dirty = __ceph_caps_dirty(ci);

	dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n",
	     inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop),
	     ceph_cap_string(unless));

	/* only drop unused, clean caps */
	drop &= ~(used | dirty);

	cap = __get_cap_for_mds(ci, mds);
	if (cap && __cap_is_valid(cap)) {
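		/*
		 * If 'unless' names an EXCL cap we actually hold, keep the
		 * matching SHARED cap out of the release: the caller only
		 * needs it dropped when we lack the exclusive bit.
		 */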
		unless &= cap->issued;
		if (unless) {
			if (unless & CEPH_CAP_AUTH_EXCL)
				drop &= ~CEPH_CAP_AUTH_SHARED;
			if (unless & CEPH_CAP_LINK_EXCL)
				drop &= ~CEPH_CAP_LINK_SHARED;
			if (unless & CEPH_CAP_XATTR_EXCL)
				drop &= ~CEPH_CAP_XATTR_SHARED;
			if (unless & CEPH_CAP_FILE_EXCL)
				drop &= ~CEPH_CAP_FILE_SHARED;
		}

		if (force || (cap->issued & drop)) {
			if (cap->issued & drop) {
				int wanted = __ceph_caps_wanted(ci);
				if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0)
					wanted |= cap->mds_wanted;
				dout("encode_inode_release %p cap %p "
				     "%s -> %s, wanted %s -> %s\n", inode, cap,
				     ceph_cap_string(cap->issued),
				     ceph_cap_string(cap->issued & ~drop),
				     ceph_cap_string(cap->mds_wanted),
				     ceph_cap_string(wanted));

				cap->issued &= ~drop;
				cap->implemented &= ~drop;
				cap->mds_wanted = wanted;
			} else {
				dout("encode_inode_release %p cap %p %s"
				     " (force)\n", inode, cap,
				     ceph_cap_string(cap->issued));
			}

			rel->ino = cpu_to_le64(ceph_ino(inode));
			rel->cap_id = cpu_to_le64(cap->cap_id);
			rel->seq = cpu_to_le32(cap->seq);
			rel->issue_seq = cpu_to_le32(cap->issue_seq);
			rel->mseq = cpu_to_le32(cap->mseq);
			rel->caps = cpu_to_le32(cap->implemented);
			rel->wanted = cpu_to_le32(cap->mds_wanted);
			rel->dname_len = 0;
			rel->dname_seq = 0;
			*p += sizeof(*rel);
			ret = 1;
		} else {
			dout("encode_inode_release %p cap %p %s (noop)\n",
			     inode, cap, ceph_cap_string(cap->issued));
		}
	}
	spin_unlock(&ci->i_ceph_lock);
	return ret;
}

int ceph_encode_dentry_release(void **p, struct dentry *dentry,
			       struct inode *dir,
			       int mds, int drop, int unless)
{
	struct dentry *parent = NULL;
	struct ceph_mds_request_release *rel = *p;
	struct ceph_dentry_info *di = ceph_dentry(dentry);
	int force = 0;
	int ret;

	/*
	 * force a record for the directory caps if we have a dentry lease.
	 * this is racy (we can't take i_ceph_lock and d_lock together), but
	 * it doesn't have to be perfect; the mds will revoke anything we
	 * don't release.
	 */
	spin_lock(&dentry->d_lock);
	if (di->lease_session && di->lease_session->s_mds == mds)
		force = 1;
	if (!dir) {
		parent = dget(dentry->d_parent);
		dir = d_inode(parent);
	}
	spin_unlock(&dentry->d_lock);

	ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
	dput(parent);

	spin_lock(&dentry->d_lock);
	if (ret && di->lease_session && di->lease_session->s_mds == mds) {
		dout("encode_dentry_release %p mds%d seq %d\n",
		     dentry, mds, (int)di->lease_seq);
		rel->dname_len = cpu_to_le32(dentry->d_name.len);
		memcpy(*p, dentry->d_name.name, dentry->d_name.len);
		*p += dentry->d_name.len;
		rel->dname_seq = cpu_to_le32(di->lease_seq);
		__ceph_mdsc_drop_dentry_lease(dentry);
	}
	spin_unlock(&dentry->d_lock);
	return ret;
}