osd_client.c

#include <linux/ceph/ceph_debug.h>

#include <linux/module.h>
#include <linux/err.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#ifdef CONFIG_BLOCK
#include <linux/bio.h>
#endif

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/messenger.h>
#include <linux/ceph/decode.h>
#include <linux/ceph/auth.h>
#include <linux/ceph/pagelist.h>

#define OSD_OPREPLY_FRONT_LEN	512

static struct kmem_cache	*ceph_osd_request_cache;

static const struct ceph_connection_operations osd_con_ops;

static void __send_queued(struct ceph_osd_client *osdc);
static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd);
static void __register_request(struct ceph_osd_client *osdc,
			       struct ceph_osd_request *req);
static void __unregister_request(struct ceph_osd_client *osdc,
				 struct ceph_osd_request *req);
static void __unregister_linger_request(struct ceph_osd_client *osdc,
					struct ceph_osd_request *req);
static void __enqueue_request(struct ceph_osd_request *req);

/*
 * Implement client access to distributed object storage cluster.
 *
 * All data objects are stored within a cluster/cloud of OSDs, or
 * "object storage devices."  (Note that Ceph OSDs have _nothing_ to
 * do with the T10 OSD extensions to SCSI.)  Ceph OSDs are simply
 * remote daemons serving up and coordinating consistent and safe
 * access to storage.
 *
 * Cluster membership and the mapping of data objects onto storage devices
 * are described by the osd map.
 *
 * We keep track of pending OSD requests (read, write), resubmit
 * requests to different OSDs when the cluster topology/data layout
 * change, or retry the affected requests when the communications
 * channel with an OSD is reset.
 */

/*
 * calculate the mapping of a file extent onto an object, and fill out the
 * request accordingly.  shorten extent as necessary if it crosses an
 * object boundary.
 *
 * fill osd op in request message.
 */
static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen,
		       u64 *objnum, u64 *objoff, u64 *objlen)
{
	u64 orig_len = *plen;
	int r;

	/* object extent? */
	r = ceph_calc_file_object_mapping(layout, off, orig_len, objnum,
					  objoff, objlen);
	if (r < 0)
		return r;
	if (*objlen < orig_len) {
		*plen = *objlen;
		dout(" skipping last %llu, final file extent %llu~%llu\n",
		     orig_len - *plen, off, *plen);
	}

	dout("calc_layout objnum=%llx %llu~%llu\n", *objnum, *objoff, *objlen);

	return 0;
}
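
/*
 * Illustrative sketch (editorial, not part of the original source):
 * assuming the default file layout of 4 MB objects with no striping, a
 * read of 4 MB at file offset 6 MB maps to object 1 at offset 2 MB, and
 * *plen is trimmed to the 2 MB that fit in that object:
 *
 *	u64 len = 4 * 1024 * 1024;	// caller's extent length
 *	u64 objnum, objoff, objlen;
 *
 *	calc_layout(layout, 6 * 1024 * 1024, &len, &objnum, &objoff, &objlen);
 *	// objnum == 1, objoff == 2 MB, objlen == 2 MB, len trimmed to 2 MB
 */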

static void ceph_osd_data_init(struct ceph_osd_data *osd_data)
{
	memset(osd_data, 0, sizeof (*osd_data));
	osd_data->type = CEPH_OSD_DATA_TYPE_NONE;
}

static void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data,
			struct page **pages, u64 length, u32 alignment,
			bool pages_from_pool, bool own_pages)
{
	osd_data->type = CEPH_OSD_DATA_TYPE_PAGES;
	osd_data->pages = pages;
	osd_data->length = length;
	osd_data->alignment = alignment;
	osd_data->pages_from_pool = pages_from_pool;
	osd_data->own_pages = own_pages;
}

static void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data,
			struct ceph_pagelist *pagelist)
{
	osd_data->type = CEPH_OSD_DATA_TYPE_PAGELIST;
	osd_data->pagelist = pagelist;
}

#ifdef CONFIG_BLOCK
static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data,
			struct bio *bio, size_t bio_length)
{
	osd_data->type = CEPH_OSD_DATA_TYPE_BIO;
	osd_data->bio = bio;
	osd_data->bio_length = bio_length;
}
#endif /* CONFIG_BLOCK */

#define osd_req_op_data(oreq, whch, typ, fld)			\
({								\
	struct ceph_osd_request *__oreq = (oreq);		\
	unsigned int __whch = (whch);				\
	BUG_ON(__whch >= __oreq->r_num_ops);			\
	&__oreq->r_ops[__whch].typ.fld;				\
})
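
/*
 * Illustrative sketch (editorial, not part of the original source): the
 * macro above expands to a bounds-checked pointer into the per-op data.
 * For example,
 *
 *	osd_req_op_data(req, 0, extent, osd_data)
 *
 * yields &req->r_ops[0].extent.osd_data after BUG_ON-checking that op 0
 * exists, which is exactly how the helpers below use it.
 */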

static struct ceph_osd_data *
osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which)
{
	BUG_ON(which >= osd_req->r_num_ops);
	return &osd_req->r_ops[which].raw_data_in;
}

struct ceph_osd_data *
osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req,
			unsigned int which)
{
	return osd_req_op_data(osd_req, which, extent, osd_data);
}
EXPORT_SYMBOL(osd_req_op_extent_osd_data);

void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req,
			unsigned int which, struct page **pages,
			u64 length, u32 alignment,
			bool pages_from_pool, bool own_pages)
{
	struct ceph_osd_data *osd_data;

	osd_data = osd_req_op_raw_data_in(osd_req, which);
	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
				pages_from_pool, own_pages);
}
EXPORT_SYMBOL(osd_req_op_raw_data_in_pages);

void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *osd_req,
			unsigned int which, struct page **pages,
			u64 length, u32 alignment,
			bool pages_from_pool, bool own_pages)
{
	struct ceph_osd_data *osd_data;

	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
				pages_from_pool, own_pages);
}
EXPORT_SYMBOL(osd_req_op_extent_osd_data_pages);

void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *osd_req,
			unsigned int which, struct ceph_pagelist *pagelist)
{
	struct ceph_osd_data *osd_data;

	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
	ceph_osd_data_pagelist_init(osd_data, pagelist);
}
EXPORT_SYMBOL(osd_req_op_extent_osd_data_pagelist);

#ifdef CONFIG_BLOCK
void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req,
			unsigned int which, struct bio *bio, size_t bio_length)
{
	struct ceph_osd_data *osd_data;

	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
	ceph_osd_data_bio_init(osd_data, bio, bio_length);
}
EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio);
#endif /* CONFIG_BLOCK */

static void osd_req_op_cls_request_info_pagelist(
			struct ceph_osd_request *osd_req,
			unsigned int which, struct ceph_pagelist *pagelist)
{
	struct ceph_osd_data *osd_data;

	osd_data = osd_req_op_data(osd_req, which, cls, request_info);
	ceph_osd_data_pagelist_init(osd_data, pagelist);
}

void osd_req_op_cls_request_data_pagelist(
			struct ceph_osd_request *osd_req,
			unsigned int which, struct ceph_pagelist *pagelist)
{
	struct ceph_osd_data *osd_data;

	osd_data = osd_req_op_data(osd_req, which, cls, request_data);
	ceph_osd_data_pagelist_init(osd_data, pagelist);
	osd_req->r_ops[which].cls.indata_len += pagelist->length;
	osd_req->r_ops[which].indata_len += pagelist->length;
}
EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist);

void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req,
			unsigned int which, struct page **pages, u64 length,
			u32 alignment, bool pages_from_pool, bool own_pages)
{
	struct ceph_osd_data *osd_data;

	osd_data = osd_req_op_data(osd_req, which, cls, request_data);
	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
				pages_from_pool, own_pages);
	osd_req->r_ops[which].cls.indata_len += length;
	osd_req->r_ops[which].indata_len += length;
}
EXPORT_SYMBOL(osd_req_op_cls_request_data_pages);

void osd_req_op_cls_response_data_pages(struct ceph_osd_request *osd_req,
			unsigned int which, struct page **pages, u64 length,
			u32 alignment, bool pages_from_pool, bool own_pages)
{
	struct ceph_osd_data *osd_data;

	osd_data = osd_req_op_data(osd_req, which, cls, response_data);
	ceph_osd_data_pages_init(osd_data, pages, length, alignment,
				pages_from_pool, own_pages);
}
EXPORT_SYMBOL(osd_req_op_cls_response_data_pages);

static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data)
{
	switch (osd_data->type) {
	case CEPH_OSD_DATA_TYPE_NONE:
		return 0;
	case CEPH_OSD_DATA_TYPE_PAGES:
		return osd_data->length;
	case CEPH_OSD_DATA_TYPE_PAGELIST:
		return (u64)osd_data->pagelist->length;
#ifdef CONFIG_BLOCK
	case CEPH_OSD_DATA_TYPE_BIO:
		return (u64)osd_data->bio_length;
#endif /* CONFIG_BLOCK */
	default:
		WARN(true, "unrecognized data type %d\n", (int)osd_data->type);
		return 0;
	}
}

static void ceph_osd_data_release(struct ceph_osd_data *osd_data)
{
	if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES && osd_data->own_pages) {
		int num_pages;

		num_pages = calc_pages_for((u64)osd_data->alignment,
						(u64)osd_data->length);
		ceph_release_page_vector(osd_data->pages, num_pages);
	}
	ceph_osd_data_init(osd_data);
}

static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
			unsigned int which)
{
	struct ceph_osd_req_op *op;

	BUG_ON(which >= osd_req->r_num_ops);
	op = &osd_req->r_ops[which];

	switch (op->op) {
	case CEPH_OSD_OP_READ:
	case CEPH_OSD_OP_WRITE:
	case CEPH_OSD_OP_WRITEFULL:
		ceph_osd_data_release(&op->extent.osd_data);
		break;
	case CEPH_OSD_OP_CALL:
		ceph_osd_data_release(&op->cls.request_info);
		ceph_osd_data_release(&op->cls.request_data);
		ceph_osd_data_release(&op->cls.response_data);
		break;
	case CEPH_OSD_OP_SETXATTR:
	case CEPH_OSD_OP_CMPXATTR:
		ceph_osd_data_release(&op->xattr.osd_data);
		break;
	case CEPH_OSD_OP_STAT:
		ceph_osd_data_release(&op->raw_data_in);
		break;
	default:
		break;
	}
}

/*
 * Assumes @t is zero-initialized.
 */
static void target_init(struct ceph_osd_request_target *t)
{
	ceph_oid_init(&t->base_oid);
	ceph_oloc_init(&t->base_oloc);
	ceph_oid_init(&t->target_oid);
	ceph_oloc_init(&t->target_oloc);

	ceph_osds_init(&t->acting);
	ceph_osds_init(&t->up);
	t->size = -1;
	t->min_size = -1;

	t->osd = CEPH_HOMELESS_OSD;
}

static void target_destroy(struct ceph_osd_request_target *t)
{
	ceph_oid_destroy(&t->base_oid);
	ceph_oid_destroy(&t->target_oid);
}

/*
 * requests
 */
static void ceph_osdc_release_request(struct kref *kref)
{
	struct ceph_osd_request *req = container_of(kref,
					    struct ceph_osd_request, r_kref);
	unsigned int which;

	dout("%s %p (r_request %p r_reply %p)\n", __func__, req,
	     req->r_request, req->r_reply);
	WARN_ON(!RB_EMPTY_NODE(&req->r_node));
	WARN_ON(!list_empty(&req->r_req_lru_item));
	WARN_ON(!list_empty(&req->r_osd_item));
	WARN_ON(!list_empty(&req->r_linger_item));
	WARN_ON(!list_empty(&req->r_linger_osd_item));
	WARN_ON(req->r_osd);

	if (req->r_request)
		ceph_msg_put(req->r_request);
	if (req->r_reply) {
		ceph_msg_revoke_incoming(req->r_reply);
		ceph_msg_put(req->r_reply);
	}

	for (which = 0; which < req->r_num_ops; which++)
		osd_req_op_data_release(req, which);

	target_destroy(&req->r_t);
	ceph_put_snap_context(req->r_snapc);
	if (req->r_mempool)
		mempool_free(req, req->r_osdc->req_mempool);
	else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS)
		kmem_cache_free(ceph_osd_request_cache, req);
	else
		kfree(req);
}

void ceph_osdc_get_request(struct ceph_osd_request *req)
{
	dout("%s %p (was %d)\n", __func__, req,
	     atomic_read(&req->r_kref.refcount));
	kref_get(&req->r_kref);
}
EXPORT_SYMBOL(ceph_osdc_get_request);

void ceph_osdc_put_request(struct ceph_osd_request *req)
{
	if (req) {
		dout("%s %p (was %d)\n", __func__, req,
		     atomic_read(&req->r_kref.refcount));
		kref_put(&req->r_kref, ceph_osdc_release_request);
	}
}
EXPORT_SYMBOL(ceph_osdc_put_request);

struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
					       struct ceph_snap_context *snapc,
					       unsigned int num_ops,
					       bool use_mempool,
					       gfp_t gfp_flags)
{
	struct ceph_osd_request *req;

	if (use_mempool) {
		BUG_ON(num_ops > CEPH_OSD_SLAB_OPS);
		req = mempool_alloc(osdc->req_mempool, gfp_flags);
	} else if (num_ops <= CEPH_OSD_SLAB_OPS) {
		req = kmem_cache_alloc(ceph_osd_request_cache, gfp_flags);
	} else {
		BUG_ON(num_ops > CEPH_OSD_MAX_OPS);
		req = kmalloc(sizeof(*req) + num_ops * sizeof(req->r_ops[0]),
			      gfp_flags);
	}
	if (unlikely(!req))
		return NULL;

	/* req only, each op is zeroed in _osd_req_op_init() */
	memset(req, 0, sizeof(*req));

	req->r_osdc = osdc;
	req->r_mempool = use_mempool;
	req->r_num_ops = num_ops;
	req->r_snapid = CEPH_NOSNAP;
	req->r_snapc = ceph_get_snap_context(snapc);

	kref_init(&req->r_kref);
	init_completion(&req->r_completion);
	init_completion(&req->r_safe_completion);
	RB_CLEAR_NODE(&req->r_node);
	INIT_LIST_HEAD(&req->r_unsafe_item);
	INIT_LIST_HEAD(&req->r_linger_item);
	INIT_LIST_HEAD(&req->r_linger_osd_item);
	INIT_LIST_HEAD(&req->r_req_lru_item);
	INIT_LIST_HEAD(&req->r_osd_item);

	target_init(&req->r_t);

	dout("%s req %p\n", __func__, req);
	return req;
}
EXPORT_SYMBOL(ceph_osdc_alloc_request);

int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
{
	struct ceph_osd_client *osdc = req->r_osdc;
	struct ceph_msg *msg;
	int msg_size;

	WARN_ON(ceph_oid_empty(&req->r_base_oid));

	/* create request message */
	msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */
	msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */
	msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
	msg_size += 1 + 8 + 4 + 4; /* pgid */
	msg_size += 4 + req->r_base_oid.name_len; /* oid */
	msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op);
	msg_size += 8; /* snapid */
	msg_size += 8; /* snap_seq */
	msg_size += 4 + 8 * (req->r_snapc ? req->r_snapc->num_snaps : 0);
	msg_size += 4; /* retry_attempt */

	if (req->r_mempool)
		msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
	else
		msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp, true);
	if (!msg)
		return -ENOMEM;

	memset(msg->front.iov_base, 0, msg->front.iov_len);
	req->r_request = msg;

	/* create reply message */
	msg_size = OSD_OPREPLY_FRONT_LEN;
	msg_size += req->r_base_oid.name_len;
	msg_size += req->r_num_ops * sizeof(struct ceph_osd_op);

	if (req->r_mempool)
		msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
	else
		msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, gfp, true);
	if (!msg)
		return -ENOMEM;

	req->r_reply = msg;

	return 0;
}
EXPORT_SYMBOL(ceph_osdc_alloc_messages);
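
/*
 * Illustrative sketch (editorial, not part of the original source): a
 * caller building a one-op request by hand pairs the two helpers above,
 * roughly:
 *
 *	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOFS);
 *	if (!req)
 *		return -ENOMEM;
 *	osd_req_op_init(req, 0, CEPH_OSD_OP_STAT, 0);
 *	// ... fill in r_base_oid/r_base_oloc and r_flags; the oid must be
 *	// set before ceph_osdc_alloc_messages() sizes the front ...
 *	ret = ceph_osdc_alloc_messages(req, GFP_NOFS);
 *
 * ceph_osdc_new_request() below wraps this sequence for file I/O.
 */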

static bool osd_req_opcode_valid(u16 opcode)
{
	switch (opcode) {
#define GENERATE_CASE(op, opcode, str)	case CEPH_OSD_OP_##op: return true;
__CEPH_FORALL_OSD_OPS(GENERATE_CASE)
#undef GENERATE_CASE
	default:
		return false;
	}
}

/*
 * This is an osd op init function for opcodes that have no data or
 * other information associated with them.  It also serves as a
 * common init routine for all the other init functions, below.
 */
static struct ceph_osd_req_op *
_osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which,
		 u16 opcode, u32 flags)
{
	struct ceph_osd_req_op *op;

	BUG_ON(which >= osd_req->r_num_ops);
	BUG_ON(!osd_req_opcode_valid(opcode));

	op = &osd_req->r_ops[which];
	memset(op, 0, sizeof (*op));
	op->op = opcode;
	op->flags = flags;

	return op;
}

void osd_req_op_init(struct ceph_osd_request *osd_req,
		     unsigned int which, u16 opcode, u32 flags)
{
	(void)_osd_req_op_init(osd_req, which, opcode, flags);
}
EXPORT_SYMBOL(osd_req_op_init);

void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
			    unsigned int which, u16 opcode,
			    u64 offset, u64 length,
			    u64 truncate_size, u32 truncate_seq)
{
	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
						      opcode, 0);
	size_t payload_len = 0;

	BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE &&
	       opcode != CEPH_OSD_OP_WRITEFULL && opcode != CEPH_OSD_OP_ZERO &&
	       opcode != CEPH_OSD_OP_TRUNCATE);

	op->extent.offset = offset;
	op->extent.length = length;
	op->extent.truncate_size = truncate_size;
	op->extent.truncate_seq = truncate_seq;
	if (opcode == CEPH_OSD_OP_WRITE || opcode == CEPH_OSD_OP_WRITEFULL)
		payload_len += length;

	op->indata_len = payload_len;
}
EXPORT_SYMBOL(osd_req_op_extent_init);

void osd_req_op_extent_update(struct ceph_osd_request *osd_req,
			      unsigned int which, u64 length)
{
	struct ceph_osd_req_op *op;
	u64 previous;

	BUG_ON(which >= osd_req->r_num_ops);
	op = &osd_req->r_ops[which];
	previous = op->extent.length;

	if (length == previous)
		return;		/* Nothing to do */
	BUG_ON(length > previous);

	op->extent.length = length;
	op->indata_len -= previous - length;
}
EXPORT_SYMBOL(osd_req_op_extent_update);

void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req,
				unsigned int which, u64 offset_inc)
{
	struct ceph_osd_req_op *op, *prev_op;

	BUG_ON(which + 1 >= osd_req->r_num_ops);

	prev_op = &osd_req->r_ops[which];
	op = _osd_req_op_init(osd_req, which + 1, prev_op->op, prev_op->flags);
	/* dup previous one */
	op->indata_len = prev_op->indata_len;
	op->outdata_len = prev_op->outdata_len;
	op->extent = prev_op->extent;
	/* adjust offset */
	op->extent.offset += offset_inc;
	op->extent.length -= offset_inc;

	if (op->op == CEPH_OSD_OP_WRITE || op->op == CEPH_OSD_OP_WRITEFULL)
		op->indata_len -= offset_inc;
}
EXPORT_SYMBOL(osd_req_op_extent_dup_last);
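
/*
 * Illustrative sketch (editorial, not part of the original source):
 * duplicating a 4 MB write at op 0 with offset_inc == 1 MB leaves op 0
 * untouched and makes op 1 cover the tail of the extent:
 *
 *	osd_req_op_extent_init(req, 0, CEPH_OSD_OP_WRITE, 0, 4 << 20, 0, 0);
 *	osd_req_op_extent_dup_last(req, 0, 1 << 20);
 *	// r_ops[1]: offset 1 MB, length 3 MB, indata_len 3 MB
 *
 * Callers splitting an op this way would then shrink op 0 with
 * osd_req_op_extent_update().
 */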

void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
			 u16 opcode, const char *class, const char *method)
{
	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
						      opcode, 0);
	struct ceph_pagelist *pagelist;
	size_t payload_len = 0;
	size_t size;

	BUG_ON(opcode != CEPH_OSD_OP_CALL);

	pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
	BUG_ON(!pagelist);
	ceph_pagelist_init(pagelist);

	op->cls.class_name = class;
	size = strlen(class);
	BUG_ON(size > (size_t) U8_MAX);
	op->cls.class_len = size;
	ceph_pagelist_append(pagelist, class, size);
	payload_len += size;

	op->cls.method_name = method;
	size = strlen(method);
	BUG_ON(size > (size_t) U8_MAX);
	op->cls.method_len = size;
	ceph_pagelist_append(pagelist, method, size);
	payload_len += size;

	osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);

	op->indata_len = payload_len;
}
EXPORT_SYMBOL(osd_req_op_cls_init);
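
/*
 * Illustrative sketch (editorial, not part of the original source): a
 * CALL op invokes a named method of an OSD object class; rbd, for
 * instance, issues calls along the lines of
 *
 *	osd_req_op_cls_init(req, 0, CEPH_OSD_OP_CALL, "rbd", "get_size");
 *	osd_req_op_cls_response_data_pages(req, 0, pages, PAGE_SIZE, 0,
 *					   false, false);
 *
 * with any method input attached via the request_data helpers above.
 */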

int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
			  u16 opcode, const char *name, const void *value,
			  size_t size, u8 cmp_op, u8 cmp_mode)
{
	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
						      opcode, 0);
	struct ceph_pagelist *pagelist;
	size_t payload_len;

	BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR);

	pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
	if (!pagelist)
		return -ENOMEM;

	ceph_pagelist_init(pagelist);

	payload_len = strlen(name);
	op->xattr.name_len = payload_len;
	ceph_pagelist_append(pagelist, name, payload_len);

	op->xattr.value_len = size;
	ceph_pagelist_append(pagelist, value, size);
	payload_len += size;

	op->xattr.cmp_op = cmp_op;
	op->xattr.cmp_mode = cmp_mode;

	ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist);
	op->indata_len = payload_len;
	return 0;
}
EXPORT_SYMBOL(osd_req_op_xattr_init);

void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
			   unsigned int which, u16 opcode,
			   u64 cookie, u64 version, int flag)
{
	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
						      opcode, 0);

	BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH);

	op->watch.cookie = cookie;
	op->watch.ver = version;
	if (opcode == CEPH_OSD_OP_WATCH && flag)
		op->watch.flag = (u8)1;
}
EXPORT_SYMBOL(osd_req_op_watch_init);

void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
				unsigned int which,
				u64 expected_object_size,
				u64 expected_write_size)
{
	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
						      CEPH_OSD_OP_SETALLOCHINT,
						      0);

	op->alloc_hint.expected_object_size = expected_object_size;
	op->alloc_hint.expected_write_size = expected_write_size;

	/*
	 * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed
	 * not worth a feature bit.  Set FAILOK per-op flag to make
	 * sure older osds don't trip over an unsupported opcode.
	 */
	op->flags |= CEPH_OSD_OP_FLAG_FAILOK;
}
EXPORT_SYMBOL(osd_req_op_alloc_hint_init);

static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
				   struct ceph_osd_data *osd_data)
{
	u64 length = ceph_osd_data_length(osd_data);

	if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
		BUG_ON(length > (u64) SIZE_MAX);
		if (length)
			ceph_msg_data_add_pages(msg, osd_data->pages,
					length, osd_data->alignment);
	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) {
		BUG_ON(!length);
		ceph_msg_data_add_pagelist(msg, osd_data->pagelist);
#ifdef CONFIG_BLOCK
	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) {
		ceph_msg_data_add_bio(msg, osd_data->bio, length);
#endif
	} else {
		BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE);
	}
}

static u32 osd_req_encode_op(struct ceph_osd_op *dst,
			     const struct ceph_osd_req_op *src)
{
	if (WARN_ON(!osd_req_opcode_valid(src->op))) {
		pr_err("unrecognized osd opcode %d\n", src->op);

		return 0;
	}

	switch (src->op) {
	case CEPH_OSD_OP_STAT:
		break;
	case CEPH_OSD_OP_READ:
	case CEPH_OSD_OP_WRITE:
	case CEPH_OSD_OP_WRITEFULL:
	case CEPH_OSD_OP_ZERO:
	case CEPH_OSD_OP_TRUNCATE:
		dst->extent.offset = cpu_to_le64(src->extent.offset);
		dst->extent.length = cpu_to_le64(src->extent.length);
		dst->extent.truncate_size =
			cpu_to_le64(src->extent.truncate_size);
		dst->extent.truncate_seq =
			cpu_to_le32(src->extent.truncate_seq);
		break;
	case CEPH_OSD_OP_CALL:
		dst->cls.class_len = src->cls.class_len;
		dst->cls.method_len = src->cls.method_len;
		dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
		break;
	case CEPH_OSD_OP_STARTSYNC:
		break;
	case CEPH_OSD_OP_NOTIFY_ACK:
	case CEPH_OSD_OP_WATCH:
		dst->watch.cookie = cpu_to_le64(src->watch.cookie);
		dst->watch.ver = cpu_to_le64(src->watch.ver);
		dst->watch.flag = src->watch.flag;
		break;
	case CEPH_OSD_OP_SETALLOCHINT:
		dst->alloc_hint.expected_object_size =
		    cpu_to_le64(src->alloc_hint.expected_object_size);
		dst->alloc_hint.expected_write_size =
		    cpu_to_le64(src->alloc_hint.expected_write_size);
		break;
	case CEPH_OSD_OP_SETXATTR:
	case CEPH_OSD_OP_CMPXATTR:
		dst->xattr.name_len = cpu_to_le32(src->xattr.name_len);
		dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
		dst->xattr.cmp_op = src->xattr.cmp_op;
		dst->xattr.cmp_mode = src->xattr.cmp_mode;
		break;
	case CEPH_OSD_OP_CREATE:
	case CEPH_OSD_OP_DELETE:
		break;
	default:
		pr_err("unsupported osd opcode %s\n",
			ceph_osd_op_name(src->op));
		WARN_ON(1);

		return 0;
	}

	dst->op = cpu_to_le16(src->op);
	dst->flags = cpu_to_le32(src->flags);
	dst->payload_len = cpu_to_le32(src->indata_len);

	return src->indata_len;
}

/*
 * build new request AND message, calculate layout, and adjust file
 * extent as needed.
 *
 * if the file was recently truncated, we include information about its
 * old and new size so that the object can be updated appropriately.  (we
 * avoid synchronously deleting truncated objects because it's slow.)
 */
struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
					       struct ceph_file_layout *layout,
					       struct ceph_vino vino,
					       u64 off, u64 *plen,
					       unsigned int which, int num_ops,
					       int opcode, int flags,
					       struct ceph_snap_context *snapc,
					       u32 truncate_seq,
					       u64 truncate_size,
					       bool use_mempool)
{
	struct ceph_osd_request *req;
	u64 objnum = 0;
	u64 objoff = 0;
	u64 objlen = 0;
	int r;

	BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE &&
	       opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE &&
	       opcode != CEPH_OSD_OP_CREATE && opcode != CEPH_OSD_OP_DELETE);

	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool,
					GFP_NOFS);
	if (!req) {
		r = -ENOMEM;
		goto fail;
	}

	/* calculate max write size */
	r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen);
	if (r)
		goto fail;

	if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) {
		osd_req_op_init(req, which, opcode, 0);
	} else {
		u32 object_size = le32_to_cpu(layout->fl_object_size);
		u32 object_base = off - objoff;
		if (!(truncate_seq == 1 && truncate_size == -1ULL)) {
			if (truncate_size <= object_base) {
				truncate_size = 0;
			} else {
				truncate_size -= object_base;
				if (truncate_size > object_size)
					truncate_size = object_size;
			}
		}
		osd_req_op_extent_init(req, which, opcode, objoff, objlen,
				       truncate_size, truncate_seq);
	}

	req->r_flags = flags;
	req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout);
	ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum);

	req->r_snapid = vino.snap;
	if (flags & CEPH_OSD_FLAG_WRITE)
		req->r_data_offset = off;

	r = ceph_osdc_alloc_messages(req, GFP_NOFS);
	if (r)
		goto fail;

	return req;

fail:
	ceph_osdc_put_request(req);
	return ERR_PTR(r);
}
EXPORT_SYMBOL(ceph_osdc_new_request);
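
/*
 * Illustrative sketch (editorial, not part of the original source): a
 * filesystem caller reads a file extent roughly like this, with error
 * handling elided:
 *
 *	len = ...;
 *	req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
 *				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
 *				    NULL, truncate_seq, truncate_size, false);
 *	osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false);
 *	ret = ceph_osdc_start_request(osdc, req, false);
 *	if (!ret)
 *		ret = ceph_osdc_wait_request(osdc, req);
 *	ceph_osdc_put_request(req);
 */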

/*
 * We keep osd requests in an rbtree, sorted by ->r_tid.
 */
DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node)
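
/*
 * Note (editorial): DEFINE_RB_FUNCS() generates the insert_request(),
 * erase_request() and lookup_request() helpers used below, keyed on
 * r_tid.
 */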

static struct ceph_osd_request *
__lookup_request_ge(struct ceph_osd_client *osdc,
		    u64 tid)
{
	struct ceph_osd_request *req;
	struct rb_node *n = osdc->requests.rb_node;

	while (n) {
		req = rb_entry(n, struct ceph_osd_request, r_node);
		if (tid < req->r_tid) {
			if (!n->rb_left)
				return req;
			n = n->rb_left;
		} else if (tid > req->r_tid) {
			n = n->rb_right;
		} else {
			return req;
		}
	}
	return NULL;
}

static void __kick_linger_request(struct ceph_osd_request *req)
{
	struct ceph_osd_client *osdc = req->r_osdc;
	struct ceph_osd *osd = req->r_osd;

	/*
	 * Linger requests need to be resent with a new tid to avoid
	 * the dup op detection logic on the OSDs.  Achieve this with
	 * a re-register dance instead of open-coding.
	 */
	ceph_osdc_get_request(req);
	if (!list_empty(&req->r_linger_item))
		__unregister_linger_request(osdc, req);
	else
		__unregister_request(osdc, req);
	__register_request(osdc, req);
	ceph_osdc_put_request(req);

	/*
	 * Unless request has been registered as both normal and
	 * lingering, __unregister{,_linger}_request clears r_osd.
	 * However, here we need to preserve r_osd to make sure we
	 * requeue on the same OSD.
	 */
	WARN_ON(req->r_osd || !osd);
	req->r_osd = osd;

	dout("%s requeueing %p tid %llu\n", __func__, req, req->r_tid);
	__enqueue_request(req);
}

/*
 * Resubmit requests pending on the given osd.
 */
static void __kick_osd_requests(struct ceph_osd_client *osdc,
				struct ceph_osd *osd)
{
	struct ceph_osd_request *req, *nreq;
	LIST_HEAD(resend);
	LIST_HEAD(resend_linger);
	int err;

	dout("%s osd%d\n", __func__, osd->o_osd);
	err = __reset_osd(osdc, osd);
	if (err)
		return;

	/*
	 * Build up a list of requests to resend by traversing the
	 * osd's list of requests.  Requests for a given object are
	 * sent in tid order, and that is also the order they're
	 * kept on this list.  Therefore all requests that are in
	 * flight will be found first, followed by all requests that
	 * have not yet been sent.  And to resend requests while
	 * preserving this order we will want to put any sent
	 * requests back on the front of the osd client's unsent
	 * list.
	 *
	 * So we build a separate ordered list of already-sent
	 * requests for the affected osd and splice it onto the
	 * front of the osd client's unsent list.  Once we've seen a
	 * request that has not yet been sent we're done.  Those
	 * requests are already sitting right where they belong.
	 */
	list_for_each_entry(req, &osd->o_requests, r_osd_item) {
		if (!req->r_sent)
			break;

		if (!req->r_linger) {
			dout("%s requeueing %p tid %llu\n", __func__, req,
			     req->r_tid);
			list_move_tail(&req->r_req_lru_item, &resend);
			req->r_flags |= CEPH_OSD_FLAG_RETRY;
		} else {
			list_move_tail(&req->r_req_lru_item, &resend_linger);
		}
	}
	list_splice(&resend, &osdc->req_unsent);

	/*
	 * Both registered and not yet registered linger requests are
	 * enqueued with a new tid on the same OSD.  We add/move them
	 * to req_unsent/o_requests at the end to keep things in tid
	 * order.
	 */
	list_for_each_entry_safe(req, nreq, &osd->o_linger_requests,
				 r_linger_osd_item) {
		WARN_ON(!list_empty(&req->r_req_lru_item));
		__kick_linger_request(req);
	}

	list_for_each_entry_safe(req, nreq, &resend_linger, r_req_lru_item)
		__kick_linger_request(req);
}

/*
 * If the osd connection drops, we need to resubmit all requests.
 */
static void osd_reset(struct ceph_connection *con)
{
	struct ceph_osd *osd = con->private;
	struct ceph_osd_client *osdc;

	if (!osd)
		return;
	dout("osd_reset osd%d\n", osd->o_osd);
	osdc = osd->o_osdc;
	down_read(&osdc->map_sem);
	mutex_lock(&osdc->request_mutex);
	__kick_osd_requests(osdc, osd);
	__send_queued(osdc);
	mutex_unlock(&osdc->request_mutex);
	up_read(&osdc->map_sem);
}

/*
 * Track open sessions with osds.
 */
static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
{
	struct ceph_osd *osd;

	osd = kzalloc(sizeof(*osd), GFP_NOFS);
	if (!osd)
		return NULL;

	atomic_set(&osd->o_ref, 1);
	osd->o_osdc = osdc;
	osd->o_osd = onum;
	RB_CLEAR_NODE(&osd->o_node);
	INIT_LIST_HEAD(&osd->o_requests);
	INIT_LIST_HEAD(&osd->o_linger_requests);
	INIT_LIST_HEAD(&osd->o_osd_lru);
	osd->o_incarnation = 1;

	ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr);

	INIT_LIST_HEAD(&osd->o_keepalive_item);
	return osd;
}

static struct ceph_osd *get_osd(struct ceph_osd *osd)
{
	if (atomic_inc_not_zero(&osd->o_ref)) {
		dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
		     atomic_read(&osd->o_ref));
		return osd;
	} else {
		dout("get_osd %p FAIL\n", osd);
		return NULL;
	}
}

static void put_osd(struct ceph_osd *osd)
{
	dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
	     atomic_read(&osd->o_ref) - 1);
	if (atomic_dec_and_test(&osd->o_ref)) {
		if (osd->o_auth.authorizer)
			ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
		kfree(osd);
	}
}

DEFINE_RB_FUNCS(osd, struct ceph_osd, o_osd, o_node)
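
/*
 * Note (editorial): as with the request tree above, this generates
 * insert_osd(), erase_osd() and lookup_osd(), keyed on the osd number
 * o_osd.
 */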

/*
 * remove an osd from our map
 */
static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
{
	dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
	WARN_ON(!list_empty(&osd->o_requests));
	WARN_ON(!list_empty(&osd->o_linger_requests));

	list_del_init(&osd->o_osd_lru);
	erase_osd(&osdc->osds, osd);
}

static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
{
	dout("%s %p osd%d\n", __func__, osd, osd->o_osd);

	if (!RB_EMPTY_NODE(&osd->o_node)) {
		ceph_con_close(&osd->o_con);
		__remove_osd(osdc, osd);
		put_osd(osd);
	}
}

static void __move_osd_to_lru(struct ceph_osd_client *osdc,
			      struct ceph_osd *osd)
{
	dout("%s %p\n", __func__, osd);
	BUG_ON(!list_empty(&osd->o_osd_lru));

	list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
	osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl;
}

static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc,
				  struct ceph_osd *osd)
{
	dout("%s %p\n", __func__, osd);

	if (list_empty(&osd->o_requests) &&
	    list_empty(&osd->o_linger_requests))
		__move_osd_to_lru(osdc, osd);
}

static void __remove_osd_from_lru(struct ceph_osd *osd)
{
	dout("__remove_osd_from_lru %p\n", osd);
	if (!list_empty(&osd->o_osd_lru))
		list_del_init(&osd->o_osd_lru);
}

/*
 * reset osd connect
 */
static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
{
	struct ceph_entity_addr *peer_addr;

	dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
	if (list_empty(&osd->o_requests) &&
	    list_empty(&osd->o_linger_requests)) {
		remove_osd(osdc, osd);
		return -ENODEV;
	}

	peer_addr = &osdc->osdmap->osd_addr[osd->o_osd];
	if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) &&
	    !ceph_con_opened(&osd->o_con)) {
		struct ceph_osd_request *req;

		dout("osd addr hasn't changed and connection never opened, "
		     "letting msgr retry\n");
		/* touch each r_stamp for handle_timeout()'s benefit */
		list_for_each_entry(req, &osd->o_requests, r_osd_item)
			req->r_stamp = jiffies;

		return -EAGAIN;
	}

	ceph_con_close(&osd->o_con);
	ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, peer_addr);
	osd->o_incarnation++;

	return 0;
}

/*
 * Register request, assign tid.  If this is the first request, set up
 * the timeout event.
 */
static void __register_request(struct ceph_osd_client *osdc,
			       struct ceph_osd_request *req)
{
	req->r_tid = ++osdc->last_tid;
	req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
	dout("__register_request %p tid %lld\n", req, req->r_tid);
	insert_request(&osdc->requests, req);
	ceph_osdc_get_request(req);
	osdc->num_requests++;
}

/*
 * called under osdc->request_mutex
 */
static void __unregister_request(struct ceph_osd_client *osdc,
				 struct ceph_osd_request *req)
{
	if (RB_EMPTY_NODE(&req->r_node)) {
		dout("__unregister_request %p tid %lld not registered\n",
			req, req->r_tid);
		return;
	}

	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
	erase_request(&osdc->requests, req);
	osdc->num_requests--;

	if (req->r_osd) {
		/* make sure the original request isn't in flight. */
		ceph_msg_revoke(req->r_request);

		list_del_init(&req->r_osd_item);
		maybe_move_osd_to_lru(osdc, req->r_osd);
		if (list_empty(&req->r_linger_osd_item))
			req->r_osd = NULL;
	}

	list_del_init(&req->r_req_lru_item);
	ceph_osdc_put_request(req);
}

/*
 * Cancel a previously queued request message
 */
static void __cancel_request(struct ceph_osd_request *req)
{
	if (req->r_sent && req->r_osd) {
		ceph_msg_revoke(req->r_request);
		req->r_sent = 0;
	}
}

static void __register_linger_request(struct ceph_osd_client *osdc,
				      struct ceph_osd_request *req)
{
	dout("%s %p tid %llu\n", __func__, req, req->r_tid);
	WARN_ON(!req->r_linger);

	ceph_osdc_get_request(req);
	list_add_tail(&req->r_linger_item, &osdc->req_linger);
	if (req->r_osd)
		list_add_tail(&req->r_linger_osd_item,
			      &req->r_osd->o_linger_requests);
}

static void __unregister_linger_request(struct ceph_osd_client *osdc,
					struct ceph_osd_request *req)
{
	WARN_ON(!req->r_linger);

	if (list_empty(&req->r_linger_item)) {
		dout("%s %p tid %llu not registered\n", __func__, req,
		     req->r_tid);
		return;
	}

	dout("%s %p tid %llu\n", __func__, req, req->r_tid);
	list_del_init(&req->r_linger_item);

	if (req->r_osd) {
		list_del_init(&req->r_linger_osd_item);
		maybe_move_osd_to_lru(osdc, req->r_osd);
		if (list_empty(&req->r_osd_item))
			req->r_osd = NULL;
	}
	ceph_osdc_put_request(req);
}

void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
				  struct ceph_osd_request *req)
{
	if (!req->r_linger) {
		dout("set_request_linger %p\n", req);
		req->r_linger = 1;
	}
}
EXPORT_SYMBOL(ceph_osdc_set_request_linger);
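
/*
 * Note (editorial): lingering is what keeps watch requests alive across
 * osdmap changes and connection resets; a linger request stays registered
 * after it completes and is resent with a fresh tid by
 * __kick_linger_request() whenever its OSD changes.
 */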

static bool __pool_full(struct ceph_pg_pool_info *pi)
{
	return pi->flags & CEPH_POOL_FLAG_FULL;
}

/*
 * Returns whether a request should be blocked from being sent
 * based on the current osdmap and osd_client settings.
 *
 * Caller should hold map_sem for read.
 */
static bool target_should_be_paused(struct ceph_osd_client *osdc,
				    const struct ceph_osd_request_target *t,
				    struct ceph_pg_pool_info *pi)
{
	bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
	bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
		       ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
		       __pool_full(pi);

	WARN_ON(pi->id != t->base_oloc.pool);
	return (t->flags & CEPH_OSD_FLAG_READ && pauserd) ||
	       (t->flags & CEPH_OSD_FLAG_WRITE && pausewr);
}

enum calc_target_result {
	CALC_TARGET_NO_ACTION = 0,
	CALC_TARGET_NEED_RESEND,
	CALC_TARGET_POOL_DNE,
};

static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
					   struct ceph_osd_request_target *t,
					   u32 *last_force_resend,
					   bool any_change)
{
	struct ceph_pg_pool_info *pi;
	struct ceph_pg pgid, last_pgid;
	struct ceph_osds up, acting;
	bool force_resend = false;
	bool need_check_tiering = false;
	bool need_resend = false;
	bool sort_bitwise = ceph_osdmap_flag(osdc->osdmap,
					     CEPH_OSDMAP_SORTBITWISE);
	enum calc_target_result ct_res;
	int ret;

	pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool);
	if (!pi) {
		t->osd = CEPH_HOMELESS_OSD;
		ct_res = CALC_TARGET_POOL_DNE;
		goto out;
	}

	if (osdc->osdmap->epoch == pi->last_force_request_resend) {
		if (last_force_resend &&
		    *last_force_resend < pi->last_force_request_resend) {
			*last_force_resend = pi->last_force_request_resend;
			force_resend = true;
		} else if (!last_force_resend) {
			force_resend = true;
		}
	}
	if (ceph_oid_empty(&t->target_oid) || force_resend) {
		ceph_oid_copy(&t->target_oid, &t->base_oid);
		need_check_tiering = true;
	}
	if (ceph_oloc_empty(&t->target_oloc) || force_resend) {
		ceph_oloc_copy(&t->target_oloc, &t->base_oloc);
		need_check_tiering = true;
	}

	if (need_check_tiering &&
	    (t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
		if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0)
			t->target_oloc.pool = pi->read_tier;
		if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0)
			t->target_oloc.pool = pi->write_tier;
	}

	ret = ceph_object_locator_to_pg(osdc->osdmap, &t->target_oid,
					&t->target_oloc, &pgid);
	if (ret) {
		WARN_ON(ret != -ENOENT);
		t->osd = CEPH_HOMELESS_OSD;
		ct_res = CALC_TARGET_POOL_DNE;
		goto out;
	}
	last_pgid.pool = pgid.pool;
	last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask);

	ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting);
	if (any_change &&
	    ceph_is_new_interval(&t->acting,
				 &acting,
				 &t->up,
				 &up,
				 t->size,
				 pi->size,
				 t->min_size,
				 pi->min_size,
				 t->pg_num,
				 pi->pg_num,
				 t->sort_bitwise,
				 sort_bitwise,
				 &last_pgid))
		force_resend = true;

	if (t->paused && !target_should_be_paused(osdc, t, pi)) {
		t->paused = false;
		need_resend = true;
	}

	if (ceph_pg_compare(&t->pgid, &pgid) ||
	    ceph_osds_changed(&t->acting, &acting, any_change) ||
	    force_resend) {
		t->pgid = pgid; /* struct */
		ceph_osds_copy(&t->acting, &acting);
		ceph_osds_copy(&t->up, &up);
		t->size = pi->size;
		t->min_size = pi->min_size;
		t->pg_num = pi->pg_num;
		t->pg_num_mask = pi->pg_num_mask;
		t->sort_bitwise = sort_bitwise;
		t->osd = acting.primary;
		need_resend = true;
	}

	ct_res = need_resend ? CALC_TARGET_NEED_RESEND : CALC_TARGET_NO_ACTION;
out:
	dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd);
	return ct_res;
}
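
/*
 * Note (editorial): calc_target() boils down to three outcomes -- the
 * pool is gone (CALC_TARGET_POOL_DNE, target left homeless), the mapping
 * moved or a resend was forced (CALC_TARGET_NEED_RESEND, pgid and acting
 * set refreshed), or nothing relevant changed (CALC_TARGET_NO_ACTION).
 */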

static void __enqueue_request(struct ceph_osd_request *req)
{
	struct ceph_osd_client *osdc = req->r_osdc;

	dout("%s %p tid %llu to osd%d\n", __func__, req, req->r_tid,
	     req->r_osd ? req->r_osd->o_osd : -1);

	if (req->r_osd) {
		__remove_osd_from_lru(req->r_osd);
		list_add_tail(&req->r_osd_item, &req->r_osd->o_requests);
		list_move_tail(&req->r_req_lru_item, &osdc->req_unsent);
	} else {
		list_move_tail(&req->r_req_lru_item, &osdc->req_notarget);
	}
}

/*
 * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
 * (as needed), and set the request r_osd appropriately.  If there is
 * no up osd, set r_osd to NULL.  Move the request to the appropriate
 * list (unsent, homeless) or leave it on the in-flight lru.
 *
 * Return 0 if unchanged, 1 if changed, or negative on error.
 *
 * Caller should hold map_sem for read and request_mutex.
 */
  1206. static int __map_request(struct ceph_osd_client *osdc,
  1207. struct ceph_osd_request *req, int force_resend)
  1208. {
  1209. enum calc_target_result ct_res;
  1210. int err;
  1211. dout("map_request %p tid %lld\n", req, req->r_tid);
  1212. ct_res = calc_target(osdc, &req->r_t, NULL, force_resend);
  1213. switch (ct_res) {
  1214. case CALC_TARGET_POOL_DNE:
  1215. list_move(&req->r_req_lru_item, &osdc->req_notarget);
  1216. return -EIO;
  1217. case CALC_TARGET_NO_ACTION:
  1218. return 0; /* no change */
  1219. default:
  1220. BUG_ON(ct_res != CALC_TARGET_NEED_RESEND);
  1221. }
  1222. dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n",
  1223. req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed, req->r_t.osd,
  1224. req->r_osd ? req->r_osd->o_osd : -1);
  1225. if (req->r_osd) {
  1226. __cancel_request(req);
  1227. list_del_init(&req->r_osd_item);
  1228. list_del_init(&req->r_linger_osd_item);
  1229. req->r_osd = NULL;
  1230. }
  1231. req->r_osd = lookup_osd(&osdc->osds, req->r_t.osd);
  1232. if (!req->r_osd && req->r_t.osd >= 0) {
  1233. err = -ENOMEM;
  1234. req->r_osd = create_osd(osdc, req->r_t.osd);
  1235. if (!req->r_osd) {
  1236. list_move(&req->r_req_lru_item, &osdc->req_notarget);
  1237. goto out;
  1238. }
  1239. dout("map_request osd %p is osd%d\n", req->r_osd,
  1240. req->r_osd->o_osd);
  1241. insert_osd(&osdc->osds, req->r_osd);
  1242. ceph_con_open(&req->r_osd->o_con,
  1243. CEPH_ENTITY_TYPE_OSD, req->r_osd->o_osd,
  1244. &osdc->osdmap->osd_addr[req->r_osd->o_osd]);
  1245. }
  1246. __enqueue_request(req);
  1247. err = 1; /* osd or pg changed */
  1248. out:
  1249. return err;
  1250. }
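
/*
 * Attach each op's data items to the outgoing message (request-side
 * data such as write payloads and xattr names/values) or to r_reply
 * (reply-side data such as read buffers).  Done at most once per
 * message - re-encoding a resent request must not re-add data.
 */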
static void setup_request_data(struct ceph_osd_request *req,
			       struct ceph_msg *msg)
{
	u32 data_len = 0;
	int i;

	if (!list_empty(&msg->data))
		return;

	WARN_ON(msg->data_length);
	for (i = 0; i < req->r_num_ops; i++) {
		struct ceph_osd_req_op *op = &req->r_ops[i];

		switch (op->op) {
		/* request */
		case CEPH_OSD_OP_WRITE:
		case CEPH_OSD_OP_WRITEFULL:
			WARN_ON(op->indata_len != op->extent.length);
			ceph_osdc_msg_data_add(msg, &op->extent.osd_data);
			break;
		case CEPH_OSD_OP_SETXATTR:
		case CEPH_OSD_OP_CMPXATTR:
			WARN_ON(op->indata_len != op->xattr.name_len +
						  op->xattr.value_len);
			ceph_osdc_msg_data_add(msg, &op->xattr.osd_data);
			break;

		/* reply */
		case CEPH_OSD_OP_STAT:
			ceph_osdc_msg_data_add(req->r_reply,
					       &op->raw_data_in);
			break;
		case CEPH_OSD_OP_READ:
			ceph_osdc_msg_data_add(req->r_reply,
					       &op->extent.osd_data);
			break;

		/* both */
		case CEPH_OSD_OP_CALL:
			WARN_ON(op->indata_len != op->cls.class_len +
						  op->cls.method_len +
						  op->cls.indata_len);
			ceph_osdc_msg_data_add(msg, &op->cls.request_info);
			/* optional, can be NONE */
			ceph_osdc_msg_data_add(msg, &op->cls.request_data);
			/* optional, can be NONE */
			ceph_osdc_msg_data_add(req->r_reply,
					       &op->cls.response_data);
			break;
		}

		data_len += op->indata_len;
	}

	WARN_ON(data_len != msg->data_length);
}

static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	void *p = msg->front.iov_base;
	void *const end = p + msg->front_alloc_len;
	u32 data_len = 0;
	int i;

	if (req->r_flags & CEPH_OSD_FLAG_WRITE) {
		/* snapshots aren't writeable */
		WARN_ON(req->r_snapid != CEPH_NOSNAP);
	} else {
		WARN_ON(req->r_mtime.tv_sec || req->r_mtime.tv_nsec ||
			req->r_data_offset || req->r_snapc);
	}

	setup_request_data(req, msg);

	ceph_encode_32(&p, 1); /* client_inc, always 1 */
	ceph_encode_32(&p, req->r_osdc->osdmap->epoch);
	ceph_encode_32(&p, req->r_flags);
	ceph_encode_timespec(p, &req->r_mtime);
	p += sizeof(struct ceph_timespec);
	/* aka reassert_version */
	memcpy(p, &req->r_replay_version, sizeof(req->r_replay_version));
	p += sizeof(req->r_replay_version);

	/* oloc */
	ceph_encode_8(&p, 4);
	ceph_encode_8(&p, 4);
	ceph_encode_32(&p, 8 + 4 + 4);
	ceph_encode_64(&p, req->r_t.target_oloc.pool);
	ceph_encode_32(&p, -1); /* preferred */
	ceph_encode_32(&p, 0); /* key len */

	/* pgid */
	ceph_encode_8(&p, 1);
	ceph_encode_64(&p, req->r_t.pgid.pool);
	ceph_encode_32(&p, req->r_t.pgid.seed);
	ceph_encode_32(&p, -1); /* preferred */

	/* oid */
	ceph_encode_32(&p, req->r_t.target_oid.name_len);
	memcpy(p, req->r_t.target_oid.name, req->r_t.target_oid.name_len);
	p += req->r_t.target_oid.name_len;

	/* ops, can imply data */
	ceph_encode_16(&p, req->r_num_ops);
	for (i = 0; i < req->r_num_ops; i++) {
		data_len += osd_req_encode_op(p, &req->r_ops[i]);
		p += sizeof(struct ceph_osd_op);
	}

	ceph_encode_64(&p, req->r_snapid); /* snapid */
	if (req->r_snapc) {
		ceph_encode_64(&p, req->r_snapc->seq);
		ceph_encode_32(&p, req->r_snapc->num_snaps);
		for (i = 0; i < req->r_snapc->num_snaps; i++)
			ceph_encode_64(&p, req->r_snapc->snaps[i]);
	} else {
		ceph_encode_64(&p, 0); /* snap_seq */
		ceph_encode_32(&p, 0); /* snaps len */
	}

	ceph_encode_32(&p, req->r_attempts); /* retry_attempt */

	BUG_ON(p > end);
	msg->front.iov_len = p - msg->front.iov_base;
	msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */
	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
	msg->hdr.data_len = cpu_to_le32(data_len);
	/*
	 * The header "data_off" is a hint to the receiver allowing it
	 * to align received data into its buffers such that there's no
	 * need to re-copy it before writing it to disk (direct I/O).
	 */
	msg->hdr.data_off = cpu_to_le16(req->r_data_offset);

	dout("%s req %p oid %*pE oid_len %d front %zu data %u\n", __func__,
	     req, req->r_t.target_oid.name_len, req->r_t.target_oid.name,
	     req->r_t.target_oid.name_len, msg->front.iov_len, data_len);
}
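
/*
 * For reference, the front section encoded above (MOSDOp v4) is laid
 * out in this order:
 *
 *	u32 client_inc (always 1)
 *	u32 osdmap epoch
 *	u32 flags
 *	ceph_timespec mtime
 *	reassert_version (aka replay_version)
 *	object locator (64-bit pool, preferred, empty key)
 *	pgid (v1: pool, seed, preferred)
 *	length-prefixed object name
 *	u16 op count, followed by the encoded ops
 *	u64 snapid, then snap context (seq + snaps)
 *	u32 retry_attempt
 */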

/*
 * @req has to be assigned a tid and registered.
 */
static void send_request(struct ceph_osd_request *req)
{
	struct ceph_osd *osd = req->r_osd;

	WARN_ON(osd->o_osd != req->r_t.osd);

	req->r_flags |= CEPH_OSD_FLAG_KNOWN_REDIR;
	if (req->r_attempts)
		req->r_flags |= CEPH_OSD_FLAG_RETRY;
	else
		WARN_ON(req->r_flags & CEPH_OSD_FLAG_RETRY);

	encode_request(req, req->r_request);

	dout("%s req %p tid %llu to pg %llu.%x osd%d flags 0x%x attempt %d\n",
	     __func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed,
	     req->r_t.osd, req->r_flags, req->r_attempts);

	req->r_t.paused = false;
	req->r_stamp = jiffies;
	req->r_attempts++;

	req->r_sent = osd->o_incarnation;
	req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
	ceph_con_send(&osd->o_con, ceph_msg_get(req->r_request));
}

/*
 * Send any requests in the queue (req_unsent).
 */
static void __send_queued(struct ceph_osd_client *osdc)
{
	struct ceph_osd_request *req, *tmp;

	dout("__send_queued\n");
	list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) {
		list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
		send_request(req);
	}
}

/*
 * Caller should hold map_sem for read and request_mutex.
 */
static int __ceph_osdc_start_request(struct ceph_osd_client *osdc,
				     struct ceph_osd_request *req,
				     bool nofail)
{
	int rc;

	__register_request(osdc, req);
	req->r_sent = 0;
	req->r_got_reply = 0;
	rc = __map_request(osdc, req, 0);
	if (rc < 0) {
		if (nofail) {
			dout("osdc_start_request failed map, will retry %lld\n",
			     req->r_tid);
			rc = 0;
		} else {
			__unregister_request(osdc, req);
		}
		return rc;
	}

	if (req->r_osd == NULL) {
		dout("send_request %p no up osds in pg\n", req);
		ceph_monc_request_next_osdmap(&osdc->client->monc);
	} else {
		__send_queued(osdc);
	}

	return 0;
}

static void __complete_request(struct ceph_osd_request *req)
{
	if (req->r_callback)
		req->r_callback(req);
	else
		complete_all(&req->r_completion);
}

/*
 * Timeout callback, called every N seconds.  When one or more OSD
 * requests have been active for more than N seconds, we send a
 * keepalive (tag + timestamp) to their OSDs to ensure any
 * communications channel reset is detected.
 */
static void handle_timeout(struct work_struct *work)
{
	struct ceph_osd_client *osdc =
		container_of(work, struct ceph_osd_client, timeout_work.work);
	struct ceph_options *opts = osdc->client->options;
	struct ceph_osd_request *req;
	struct ceph_osd *osd;
	struct list_head slow_osds;

	dout("timeout\n");
	down_read(&osdc->map_sem);

	ceph_monc_request_next_osdmap(&osdc->client->monc);

	mutex_lock(&osdc->request_mutex);

	/*
	 * ping osds that are a bit slow.  this ensures that if there
	 * is a break in the TCP connection we will notice, and reopen
	 * a connection with that osd (from the fault callback).
	 */
	INIT_LIST_HEAD(&slow_osds);
	list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
		if (time_before(jiffies,
				req->r_stamp + opts->osd_keepalive_timeout))
			break; /* req_lru is kept in send (r_stamp) order */

		osd = req->r_osd;
		BUG_ON(!osd);
		dout(" tid %llu is slow, will send keepalive on osd%d\n",
		     req->r_tid, osd->o_osd);
		list_move_tail(&osd->o_keepalive_item, &slow_osds);
	}
	while (!list_empty(&slow_osds)) {
		osd = list_entry(slow_osds.next, struct ceph_osd,
				 o_keepalive_item);
		list_del_init(&osd->o_keepalive_item);
		ceph_con_keepalive(&osd->o_con);
	}

	__send_queued(osdc);
	mutex_unlock(&osdc->request_mutex);
	up_read(&osdc->map_sem);

	schedule_delayed_work(&osdc->timeout_work,
			      osdc->client->options->osd_keepalive_timeout);
}

static void handle_osds_timeout(struct work_struct *work)
{
	struct ceph_osd_client *osdc =
		container_of(work, struct ceph_osd_client,
			     osds_timeout_work.work);
	unsigned long delay = osdc->client->options->osd_idle_ttl / 4;
	struct ceph_osd *osd, *nosd;

	dout("%s osdc %p\n", __func__, osdc);
	down_read(&osdc->map_sem);
	mutex_lock(&osdc->request_mutex);

	list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
		if (time_before(jiffies, osd->lru_ttl))
			break;

		remove_osd(osdc, osd);
	}

	mutex_unlock(&osdc->request_mutex);
	up_read(&osdc->map_sem);

	schedule_delayed_work(&osdc->osds_timeout_work,
			      round_jiffies_relative(delay));
}
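
/*
 * Decode a ceph_object_locator, rejecting encodings the kernel client
 * cannot act on: pre-v3 structs and locators with a non-empty key or
 * namespace or an explicit hash.
 */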
static int ceph_oloc_decode(void **p, void *end,
			    struct ceph_object_locator *oloc)
{
	u8 struct_v, struct_cv;
	u32 len;
	void *struct_end;
	int ret = 0;

	ceph_decode_need(p, end, 1 + 1 + 4, e_inval);
	struct_v = ceph_decode_8(p);
	struct_cv = ceph_decode_8(p);
	if (struct_v < 3) {
		pr_warn("got v %d < 3 cv %d of ceph_object_locator\n",
			struct_v, struct_cv);
		goto e_inval;
	}
	if (struct_cv > 6) {
		pr_warn("got v %d cv %d > 6 of ceph_object_locator\n",
			struct_v, struct_cv);
		goto e_inval;
	}
	len = ceph_decode_32(p);
	ceph_decode_need(p, end, len, e_inval);
	struct_end = *p + len;

	oloc->pool = ceph_decode_64(p);
	*p += 4; /* skip preferred */

	len = ceph_decode_32(p);
	if (len > 0) {
		pr_warn("ceph_object_locator::key is set\n");
		goto e_inval;
	}

	if (struct_v >= 5) {
		len = ceph_decode_32(p);
		if (len > 0) {
			pr_warn("ceph_object_locator::nspace is set\n");
			goto e_inval;
		}
	}

	if (struct_v >= 6) {
		s64 hash = ceph_decode_64(p);

		if (hash != -1) {
			pr_warn("ceph_object_locator::hash is set\n");
			goto e_inval;
		}
	}

	/* skip the rest */
	*p = struct_end;
out:
	return ret;

e_inval:
	ret = -EINVAL;
	goto out;
}

static int ceph_redirect_decode(void **p, void *end,
				struct ceph_request_redirect *redir)
{
	u8 struct_v, struct_cv;
	u32 len;
	void *struct_end;
	int ret;

	ceph_decode_need(p, end, 1 + 1 + 4, e_inval);
	struct_v = ceph_decode_8(p);
	struct_cv = ceph_decode_8(p);
	if (struct_cv > 1) {
		pr_warn("got v %d cv %d > 1 of ceph_request_redirect\n",
			struct_v, struct_cv);
		goto e_inval;
	}
	len = ceph_decode_32(p);
	ceph_decode_need(p, end, len, e_inval);
	struct_end = *p + len;

	ret = ceph_oloc_decode(p, end, &redir->oloc);
	if (ret)
		goto out;

	len = ceph_decode_32(p);
	if (len > 0) {
		pr_warn("ceph_request_redirect::object_name is set\n");
		goto e_inval;
	}

	len = ceph_decode_32(p);
	*p += len; /* skip osd_instructions */

	/* skip the rest */
	*p = struct_end;
out:
	return ret;

e_inval:
	ret = -EINVAL;
	goto out;
}
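
/*
 * Decoded, in-memory representation of an OSD op reply message
 * (CEPH_MSG_OSD_OPREPLY), filled in by decode_MOSDOpReply() below.
 */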
struct MOSDOpReply {
	struct ceph_pg pgid;
	u64 flags;
	int result;
	u32 epoch;
	int num_ops;
	u32 outdata_len[CEPH_OSD_MAX_OPS];
	s32 rval[CEPH_OSD_MAX_OPS];
	int retry_attempt;
	struct ceph_eversion replay_version;
	u64 user_version;
	struct ceph_request_redirect redirect;
};

static int decode_MOSDOpReply(const struct ceph_msg *msg, struct MOSDOpReply *m)
{
	void *p = msg->front.iov_base;
	void *const end = p + msg->front.iov_len;
	u16 version = le16_to_cpu(msg->hdr.version);
	struct ceph_eversion bad_replay_version;
	u8 decode_redir;
	u32 len;
	int ret;
	int i;

	ceph_decode_32_safe(&p, end, len, e_inval);
	ceph_decode_need(&p, end, len, e_inval);
	p += len; /* skip oid */

	ret = ceph_decode_pgid(&p, end, &m->pgid);
	if (ret)
		return ret;

	ceph_decode_64_safe(&p, end, m->flags, e_inval);
	ceph_decode_32_safe(&p, end, m->result, e_inval);
	ceph_decode_need(&p, end, sizeof(bad_replay_version), e_inval);
	memcpy(&bad_replay_version, p, sizeof(bad_replay_version));
	p += sizeof(bad_replay_version);
	ceph_decode_32_safe(&p, end, m->epoch, e_inval);

	ceph_decode_32_safe(&p, end, m->num_ops, e_inval);
	if (m->num_ops > ARRAY_SIZE(m->outdata_len))
		goto e_inval;

	ceph_decode_need(&p, end, m->num_ops * sizeof(struct ceph_osd_op),
			 e_inval);
	for (i = 0; i < m->num_ops; i++) {
		struct ceph_osd_op *op = p;

		m->outdata_len[i] = le32_to_cpu(op->payload_len);
		p += sizeof(*op);
	}

	ceph_decode_32_safe(&p, end, m->retry_attempt, e_inval);
	for (i = 0; i < m->num_ops; i++)
		ceph_decode_32_safe(&p, end, m->rval[i], e_inval);

	if (version >= 5) {
		ceph_decode_need(&p, end, sizeof(m->replay_version), e_inval);
		memcpy(&m->replay_version, p, sizeof(m->replay_version));
		p += sizeof(m->replay_version);
		ceph_decode_64_safe(&p, end, m->user_version, e_inval);
	} else {
		m->replay_version = bad_replay_version; /* struct */
		m->user_version = le64_to_cpu(m->replay_version.version);
	}

	if (version >= 6) {
		if (version >= 7)
			ceph_decode_8_safe(&p, end, decode_redir, e_inval);
		else
			decode_redir = 1;
	} else {
		decode_redir = 0;
	}

	if (decode_redir) {
		ret = ceph_redirect_decode(&p, end, &m->redirect);
		if (ret)
			return ret;
	} else {
		ceph_oloc_init(&m->redirect.oloc);
	}

	return 0;

e_inval:
	return -EINVAL;
}
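
/*
 * Note the version-dependent fields handled above: replay_version and
 * user_version are carried only by v5+ replies (older ones fall back
 * to bad_replay_version), and a redirect is decoded unconditionally
 * for v6 but only when flagged by the osd for v7+.
 */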

/*
 * We are done with @req if
 * - @m is a safe reply, or
 * - @m is an unsafe reply and we didn't want a safe one
 */
static bool done_request(const struct ceph_osd_request *req,
			 const struct MOSDOpReply *m)
{
	return (m->result < 0 ||
		(m->flags & CEPH_OSD_FLAG_ONDISK) ||
		!(req->r_flags & CEPH_OSD_FLAG_ONDISK));
}
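
/*
 * For example, a plain read (no ONDISK in r_flags) is done as soon as
 * any reply arrives, while a write sent with CEPH_OSD_FLAG_ONDISK is
 * done only once the osd reports the data committed (or an error).
 */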

/*
 * handle osd op reply.  either call the callback if it is specified,
 * or do the completion to wake up the waiting thread.
 *
 * ->r_unsafe_callback is set?	yes			no
 *
 * first reply is OK (needed	r_cb/r_completion,	r_cb/r_completion,
 * any or needed/got safe)	r_safe_completion	r_safe_completion
 *
 * first reply is unsafe	r_unsafe_cb(true)	(nothing)
 *
 * when we get the safe reply	r_unsafe_cb(false),	r_cb/r_completion,
 *				r_safe_completion	r_safe_completion
 */
static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
{
	struct ceph_osd_request *req;
	struct MOSDOpReply m;
	u64 tid = le64_to_cpu(msg->hdr.tid);
	u32 data_len = 0;
	bool already_acked;
	int ret;
	int i;

	dout("%s msg %p tid %llu\n", __func__, msg, tid);

	down_read(&osdc->map_sem);
	mutex_lock(&osdc->request_mutex);
	req = lookup_request(&osdc->requests, tid);
	if (!req) {
		dout("%s no tid %llu\n", __func__, tid);
		goto out_unlock;
	}
	ceph_osdc_get_request(req);

	ret = decode_MOSDOpReply(msg, &m);
	if (ret) {
		pr_err("failed to decode MOSDOpReply for tid %llu: %d\n",
		       req->r_tid, ret);
		ceph_msg_dump(msg);
		goto fail_request;
	}
	dout("%s req %p tid %llu flags 0x%llx pgid %llu.%x epoch %u attempt %d v %u'%llu uv %llu\n",
	     __func__, req, req->r_tid, m.flags, m.pgid.pool, m.pgid.seed,
	     m.epoch, m.retry_attempt, le32_to_cpu(m.replay_version.epoch),
	     le64_to_cpu(m.replay_version.version), m.user_version);

	if (m.retry_attempt >= 0) {
		if (m.retry_attempt != req->r_attempts - 1) {
			dout("req %p tid %llu retry_attempt %d != %d, ignoring\n",
			     req, req->r_tid, m.retry_attempt,
			     req->r_attempts - 1);
			goto out_put;
		}
	} else {
		WARN_ON(1); /* MOSDOpReply v4 is assumed */
	}

	if (!ceph_oloc_empty(&m.redirect.oloc)) {
		dout("req %p tid %llu redirect pool %lld\n", req, req->r_tid,
		     m.redirect.oloc.pool);
		__unregister_request(osdc, req);

		ceph_oloc_copy(&req->r_t.target_oloc, &m.redirect.oloc);

		/*
		 * Start redirect requests with nofail=true.  If
		 * mapping fails, request will end up on the notarget
		 * list, waiting for the new osdmap (which can take
		 * a while), even though the original request mapped
		 * successfully.  In the future we might want to follow
		 * the original request's nofail setting here.
		 */
		ret = __ceph_osdc_start_request(osdc, req, true);
		BUG_ON(ret);

		goto out_put;
	}

	if (m.num_ops != req->r_num_ops) {
		pr_err("num_ops %d != %d for tid %llu\n", m.num_ops,
		       req->r_num_ops, req->r_tid);
		goto fail_request;
	}
	for (i = 0; i < req->r_num_ops; i++) {
		dout(" req %p tid %llu op %d rval %d len %u\n", req,
		     req->r_tid, i, m.rval[i], m.outdata_len[i]);
		req->r_ops[i].rval = m.rval[i];
		req->r_ops[i].outdata_len = m.outdata_len[i];
		data_len += m.outdata_len[i];
	}
	if (data_len != le32_to_cpu(msg->hdr.data_len)) {
		pr_err("sum of lens %u != %u for tid %llu\n", data_len,
		       le32_to_cpu(msg->hdr.data_len), req->r_tid);
		goto fail_request;
	}

	dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__,
	     req, req->r_tid, req->r_got_reply, m.result, data_len);
	already_acked = req->r_got_reply;
	if (!already_acked) {
		req->r_result = m.result ?: data_len;
		req->r_replay_version = m.replay_version; /* struct */
		req->r_got_reply = true;
	} else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) {
		dout("req %p tid %llu dup ack\n", req, req->r_tid);
		goto out_put;
	}

	if (done_request(req, &m)) {
		__unregister_request(osdc, req);
		if (req->r_linger) {
			WARN_ON(req->r_unsafe_callback);
			__register_linger_request(osdc, req);
		}
	}

	mutex_unlock(&osdc->request_mutex);
	up_read(&osdc->map_sem);

	if (done_request(req, &m)) {
		if (already_acked && req->r_unsafe_callback) {
			dout("req %p tid %llu safe-cb\n", req, req->r_tid);
			req->r_unsafe_callback(req, false);
		} else {
			dout("req %p tid %llu cb\n", req, req->r_tid);
			__complete_request(req);
		}
	} else {
		if (req->r_unsafe_callback) {
			dout("req %p tid %llu unsafe-cb\n", req, req->r_tid);
			req->r_unsafe_callback(req, true);
		} else {
			WARN_ON(1);
		}
	}
	if (m.flags & CEPH_OSD_FLAG_ONDISK)
		complete_all(&req->r_safe_completion);

	ceph_osdc_put_request(req);
	return;

fail_request:
	req->r_result = -EIO;
	__unregister_request(osdc, req);
	__complete_request(req);
	complete_all(&req->r_safe_completion);
out_put:
	ceph_osdc_put_request(req);
out_unlock:
	mutex_unlock(&osdc->request_mutex);
	up_read(&osdc->map_sem);
}
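
/*
 * Reset any osd whose connection no longer matches the current osdmap:
 * either it went down or its address changed.
 */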
static void reset_changed_osds(struct ceph_osd_client *osdc)
{
	struct rb_node *p, *n;

	dout("%s %p\n", __func__, osdc);
	for (p = rb_first(&osdc->osds); p; p = n) {
		struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node);

		n = rb_next(p);
		if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
		    memcmp(&osd->o_con.peer_addr,
			   ceph_osd_addr(osdc->osdmap, osd->o_osd),
			   sizeof(struct ceph_entity_addr)) != 0)
			__reset_osd(osdc, osd);
	}
}

/*
 * Requeue requests whose mapping to an OSD has changed.  If requests map to
 * no osd, request a new map.
 *
 * Caller should hold map_sem for read.
 */
static void kick_requests(struct ceph_osd_client *osdc, bool force_resend,
			  bool force_resend_writes)
{
	struct ceph_osd_request *req, *nreq;
	struct rb_node *p;
	int needmap = 0;
	int err;
	bool force_resend_req;

	dout("kick_requests %s %s\n", force_resend ? " (force resend)" : "",
	     force_resend_writes ? " (force resend writes)" : "");
	mutex_lock(&osdc->request_mutex);
	for (p = rb_first(&osdc->requests); p; ) {
		req = rb_entry(p, struct ceph_osd_request, r_node);
		p = rb_next(p);

		/*
		 * For linger requests that have not yet been
		 * registered, move them to the linger list; they'll
		 * be sent to the osd in the loop below.  Unregister
		 * the request before re-registering it as a linger
		 * request to ensure the __map_request() below
		 * will decide it needs to be sent.
		 */
		if (req->r_linger && list_empty(&req->r_linger_item)) {
			dout("%p tid %llu restart on osd%d\n",
			     req, req->r_tid,
			     req->r_osd ? req->r_osd->o_osd : -1);
			ceph_osdc_get_request(req);
			__unregister_request(osdc, req);
			__register_linger_request(osdc, req);
			ceph_osdc_put_request(req);
			continue;
		}

		force_resend_req = force_resend ||
			(force_resend_writes &&
			 req->r_flags & CEPH_OSD_FLAG_WRITE);
		err = __map_request(osdc, req, force_resend_req);
		if (err < 0)
			continue; /* error */
		if (req->r_osd == NULL) {
			dout("%p tid %llu maps to no osd\n", req, req->r_tid);
			needmap++; /* request a newer map */
		} else if (err > 0) {
			if (!req->r_linger) {
				dout("%p tid %llu requeued on osd%d\n", req,
				     req->r_tid,
				     req->r_osd ? req->r_osd->o_osd : -1);
				req->r_flags |= CEPH_OSD_FLAG_RETRY;
			}
		}
	}

	list_for_each_entry_safe(req, nreq, &osdc->req_linger,
				 r_linger_item) {
		dout("linger req=%p req->r_osd=%p\n", req, req->r_osd);

		err = __map_request(osdc, req,
				    force_resend || force_resend_writes);
		dout("__map_request returned %d\n", err);
		if (err < 0)
			continue; /* hrm! */
		if (req->r_osd == NULL || err > 0) {
			if (req->r_osd == NULL) {
				dout("lingering %p tid %llu maps to no osd\n",
				     req, req->r_tid);
				/*
				 * A homeless lingering request makes
				 * no sense, as its job is to keep
				 * a particular OSD connection open.
				 * Request a newer map and kick the
				 * request, knowing that it won't be
				 * resent until we actually get a map
				 * that can tell us where to send it.
				 */
				needmap++;
			}

			dout("kicking lingering %p tid %llu osd%d\n", req,
			     req->r_tid, req->r_osd ? req->r_osd->o_osd : -1);
			__register_request(osdc, req);
			__unregister_linger_request(osdc, req);
		}
	}
	reset_changed_osds(osdc);
	mutex_unlock(&osdc->request_mutex);

	if (needmap) {
		dout("%d requests for down osds, need new map\n", needmap);
		ceph_monc_request_next_osdmap(&osdc->client->monc);
	}
}

/*
 * Process updated osd map.
 *
 * The message contains any number of incremental and full maps, normally
 * indicating some sort of topology change in the cluster.  Kick requests
 * off to different OSDs as needed.
 */
void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
{
	void *p, *end, *next;
	u32 nr_maps, maplen;
	u32 epoch;
	struct ceph_osdmap *newmap = NULL, *oldmap;
	int err;
	struct ceph_fsid fsid;
	bool was_full;

	dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
	p = msg->front.iov_base;
	end = p + msg->front.iov_len;

	/* verify fsid */
	ceph_decode_need(&p, end, sizeof(fsid), bad);
	ceph_decode_copy(&p, &fsid, sizeof(fsid));
	if (ceph_check_fsid(osdc->client, &fsid) < 0)
		return;

	down_write(&osdc->map_sem);

	was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);

	/* incremental maps */
	ceph_decode_32_safe(&p, end, nr_maps, bad);
	dout(" %d inc maps\n", nr_maps);
	while (nr_maps > 0) {
		ceph_decode_need(&p, end, 2*sizeof(u32), bad);
		epoch = ceph_decode_32(&p);
		maplen = ceph_decode_32(&p);
		ceph_decode_need(&p, end, maplen, bad);
		next = p + maplen;
		if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
			dout("applying incremental map %u len %d\n",
			     epoch, maplen);
			newmap = osdmap_apply_incremental(&p, next,
							  osdc->osdmap);
			if (IS_ERR(newmap)) {
				err = PTR_ERR(newmap);
				goto bad;
			}
			BUG_ON(!newmap);
			if (newmap != osdc->osdmap) {
				ceph_osdmap_destroy(osdc->osdmap);
				osdc->osdmap = newmap;
			}
			was_full = was_full ||
				ceph_osdmap_flag(osdc->osdmap,
						 CEPH_OSDMAP_FULL);
			kick_requests(osdc, false, was_full);
		} else {
			dout("ignoring incremental map %u len %d\n",
			     epoch, maplen);
		}
		p = next;
		nr_maps--;
	}
	if (newmap)
		goto done;

	/* full maps */
	ceph_decode_32_safe(&p, end, nr_maps, bad);
	dout(" %d full maps\n", nr_maps);
	while (nr_maps) {
		ceph_decode_need(&p, end, 2*sizeof(u32), bad);
		epoch = ceph_decode_32(&p);
		maplen = ceph_decode_32(&p);
		ceph_decode_need(&p, end, maplen, bad);
		if (nr_maps > 1) {
			dout("skipping non-latest full map %u len %d\n",
			     epoch, maplen);
		} else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
			dout("skipping full map %u len %d, "
			     "older than our %u\n", epoch, maplen,
			     osdc->osdmap->epoch);
		} else {
			int skipped_map = 0;

			dout("taking full map %u len %d\n", epoch, maplen);
			newmap = ceph_osdmap_decode(&p, p+maplen);
			if (IS_ERR(newmap)) {
				err = PTR_ERR(newmap);
				goto bad;
			}
			BUG_ON(!newmap);
			oldmap = osdc->osdmap;
			osdc->osdmap = newmap;
			if (oldmap) {
				if (oldmap->epoch + 1 < newmap->epoch)
					skipped_map = 1;
				ceph_osdmap_destroy(oldmap);
			}
			was_full = was_full ||
				ceph_osdmap_flag(osdc->osdmap,
						 CEPH_OSDMAP_FULL);
			kick_requests(osdc, skipped_map, was_full);
		}
		p += maplen;
		nr_maps--;
	}

	if (!osdc->osdmap)
		goto bad;
done:
	downgrade_write(&osdc->map_sem);
	ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
			  osdc->osdmap->epoch);

	/*
	 * subscribe to subsequent osdmap updates if full to ensure
	 * we find out when we are no longer full and stop returning
	 * ENOSPC.
	 */
	if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
	    ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) ||
	    ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR))
		ceph_monc_request_next_osdmap(&osdc->client->monc);

	mutex_lock(&osdc->request_mutex);
	__send_queued(osdc);
	mutex_unlock(&osdc->request_mutex);
	up_read(&osdc->map_sem);
	wake_up_all(&osdc->client->auth_wq);
	return;

bad:
	pr_err("osdc handle_map corrupt msg\n");
	ceph_msg_dump(msg);
	up_write(&osdc->map_sem);
}

/*
 * watch/notify callback event infrastructure
 *
 * These callbacks are used both for watch and notify operations.
 */
static void __release_event(struct kref *kref)
{
	struct ceph_osd_event *event =
		container_of(kref, struct ceph_osd_event, kref);

	dout("__release_event %p\n", event);
	kfree(event);
}

static void get_event(struct ceph_osd_event *event)
{
	kref_get(&event->kref);
}

void ceph_osdc_put_event(struct ceph_osd_event *event)
{
	kref_put(&event->kref, __release_event);
}
EXPORT_SYMBOL(ceph_osdc_put_event);

static void __insert_event(struct ceph_osd_client *osdc,
			   struct ceph_osd_event *new)
{
	struct rb_node **p = &osdc->event_tree.rb_node;
	struct rb_node *parent = NULL;
	struct ceph_osd_event *event = NULL;

	while (*p) {
		parent = *p;
		event = rb_entry(parent, struct ceph_osd_event, node);
		if (new->cookie < event->cookie)
			p = &(*p)->rb_left;
		else if (new->cookie > event->cookie)
			p = &(*p)->rb_right;
		else
			BUG();
	}

	rb_link_node(&new->node, parent, p);
	rb_insert_color(&new->node, &osdc->event_tree);
}

static struct ceph_osd_event *__find_event(struct ceph_osd_client *osdc,
					   u64 cookie)
{
	struct rb_node **p = &osdc->event_tree.rb_node;
	struct rb_node *parent = NULL;
	struct ceph_osd_event *event = NULL;

	while (*p) {
		parent = *p;
		event = rb_entry(parent, struct ceph_osd_event, node);
		if (cookie < event->cookie)
			p = &(*p)->rb_left;
		else if (cookie > event->cookie)
			p = &(*p)->rb_right;
		else
			return event;
	}
	return NULL;
}

static void __remove_event(struct ceph_osd_event *event)
{
	struct ceph_osd_client *osdc = event->osdc;

	if (!RB_EMPTY_NODE(&event->node)) {
		dout("__remove_event removed %p\n", event);
		rb_erase(&event->node, &osdc->event_tree);
		ceph_osdc_put_event(event);
	} else {
		dout("__remove_event didn't remove %p\n", event);
	}
}

int ceph_osdc_create_event(struct ceph_osd_client *osdc,
			   void (*event_cb)(u64, u64, u8, void *),
			   void *data, struct ceph_osd_event **pevent)
{
	struct ceph_osd_event *event;

	event = kmalloc(sizeof(*event), GFP_NOIO);
	if (!event)
		return -ENOMEM;

	dout("create_event %p\n", event);
	event->cb = event_cb;
	event->one_shot = 0;
	event->data = data;
	event->osdc = osdc;
	INIT_LIST_HEAD(&event->osd_node);
	RB_CLEAR_NODE(&event->node);
	kref_init(&event->kref); /* one ref for us */
	kref_get(&event->kref);  /* one ref for the caller */

	spin_lock(&osdc->event_lock);
	event->cookie = ++osdc->event_count;
	__insert_event(osdc, event);
	spin_unlock(&osdc->event_lock);

	*pevent = event;
	return 0;
}
EXPORT_SYMBOL(ceph_osdc_create_event);

void ceph_osdc_cancel_event(struct ceph_osd_event *event)
{
	struct ceph_osd_client *osdc = event->osdc;

	dout("cancel_event %p\n", event);
	spin_lock(&osdc->event_lock);
	__remove_event(event);
	spin_unlock(&osdc->event_lock);
	ceph_osdc_put_event(event); /* caller's */
}
EXPORT_SYMBOL(ceph_osdc_cancel_event);
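
/*
 * Typical use, as a rough sketch (the watch op wiring lives outside
 * this file; my_cb/my_data are placeholders): create an event, embed
 * its cookie in a watch op on the object of interest, and cancel the
 * event once done watching:
 *
 *	ceph_osdc_create_event(osdc, my_cb, my_data, &event);
 *	... issue a watch op carrying event->cookie ...
 *	ceph_osdc_cancel_event(event);
 *
 * Incoming notifies are matched back to the event by cookie in
 * handle_watch_notify() and dispatched to my_cb() from the notify
 * workqueue.
 */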

static void do_event_work(struct work_struct *work)
{
	struct ceph_osd_event_work *event_work =
		container_of(work, struct ceph_osd_event_work, work);
	struct ceph_osd_event *event = event_work->event;
	u64 ver = event_work->ver;
	u64 notify_id = event_work->notify_id;
	u8 opcode = event_work->opcode;

	dout("do_event_work completing %p\n", event);
	event->cb(ver, notify_id, opcode, event->data);
	dout("do_event_work completed %p\n", event);
	ceph_osdc_put_event(event);
	kfree(event_work);
}

/*
 * Process osd watch notifications
 */
static void handle_watch_notify(struct ceph_osd_client *osdc,
				struct ceph_msg *msg)
{
	void *p, *end;
	u8 proto_ver;
	u64 cookie, ver, notify_id;
	u8 opcode;
	struct ceph_osd_event *event;
	struct ceph_osd_event_work *event_work;

	p = msg->front.iov_base;
	end = p + msg->front.iov_len;

	ceph_decode_8_safe(&p, end, proto_ver, bad);
	ceph_decode_8_safe(&p, end, opcode, bad);
	ceph_decode_64_safe(&p, end, cookie, bad);
	ceph_decode_64_safe(&p, end, ver, bad);
	ceph_decode_64_safe(&p, end, notify_id, bad);

	spin_lock(&osdc->event_lock);
	event = __find_event(osdc, cookie);
	if (event) {
		BUG_ON(event->one_shot);
		get_event(event);
	}
	spin_unlock(&osdc->event_lock);
	dout("handle_watch_notify cookie %lld ver %lld event %p\n",
	     cookie, ver, event);
	if (event) {
		event_work = kmalloc(sizeof(*event_work), GFP_NOIO);
		if (!event_work) {
			pr_err("couldn't allocate event_work\n");
			ceph_osdc_put_event(event);
			return;
		}
		INIT_WORK(&event_work->work, do_event_work);
		event_work->event = event;
		event_work->ver = ver;
		event_work->notify_id = notify_id;
		event_work->opcode = opcode;

		queue_work(osdc->notify_wq, &event_work->work);
	}

	return;

bad:
	pr_err("osdc handle_watch_notify corrupt msg\n");
}

/*
 * Register request, send initial attempt.
 */
int ceph_osdc_start_request(struct ceph_osd_client *osdc,
			    struct ceph_osd_request *req,
			    bool nofail)
{
	int rc;

	down_read(&osdc->map_sem);
	mutex_lock(&osdc->request_mutex);

	rc = __ceph_osdc_start_request(osdc, req, nofail);

	mutex_unlock(&osdc->request_mutex);
	up_read(&osdc->map_sem);

	return rc;
}
EXPORT_SYMBOL(ceph_osdc_start_request);
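
/*
 * A minimal synchronous caller looks like this (a sketch, with error
 * handling trimmed; ceph_osdc_readpages() and ceph_osdc_writepages()
 * below follow the same pattern):
 *
 *	ret = ceph_osdc_start_request(osdc, req, false);
 *	if (!ret)
 *		ret = ceph_osdc_wait_request(osdc, req);
 *	ceph_osdc_put_request(req);
 */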

/*
 * Unregister a registered request.  The request is not completed (i.e.
 * no callbacks or wakeups) - higher layers are supposed to know what
 * they are canceling.
 */
void ceph_osdc_cancel_request(struct ceph_osd_request *req)
{
	struct ceph_osd_client *osdc = req->r_osdc;

	mutex_lock(&osdc->request_mutex);
	if (req->r_linger)
		__unregister_linger_request(osdc, req);
	__unregister_request(osdc, req);
	mutex_unlock(&osdc->request_mutex);

	dout("%s %p tid %llu canceled\n", __func__, req, req->r_tid);
}
EXPORT_SYMBOL(ceph_osdc_cancel_request);

/*
 * wait for a request to complete
 */
int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
			   struct ceph_osd_request *req)
{
	int rc;

	dout("%s %p tid %llu\n", __func__, req, req->r_tid);

	rc = wait_for_completion_interruptible(&req->r_completion);
	if (rc < 0) {
		dout("%s %p tid %llu interrupted\n", __func__, req, req->r_tid);
		ceph_osdc_cancel_request(req);

		/* kludge - need to wake ceph_osdc_sync() */
		complete_all(&req->r_safe_completion);
		return rc;
	}

	dout("%s %p tid %llu result %d\n", __func__, req, req->r_tid,
	     req->r_result);
	return req->r_result;
}
EXPORT_SYMBOL(ceph_osdc_wait_request);

/*
 * sync - wait for all in-flight requests to flush.  avoid starvation.
 */
void ceph_osdc_sync(struct ceph_osd_client *osdc)
{
	struct ceph_osd_request *req;
	u64 last_tid, next_tid = 0;

	mutex_lock(&osdc->request_mutex);
	last_tid = osdc->last_tid;
	while (1) {
		req = __lookup_request_ge(osdc, next_tid);
		if (!req)
			break;
		if (req->r_tid > last_tid)
			break;

		next_tid = req->r_tid + 1;
		if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
			continue;

		ceph_osdc_get_request(req);
		mutex_unlock(&osdc->request_mutex);
		dout("sync waiting on tid %llu (last is %llu)\n",
		     req->r_tid, last_tid);
		wait_for_completion(&req->r_safe_completion);
		mutex_lock(&osdc->request_mutex);
		ceph_osdc_put_request(req);
	}
	mutex_unlock(&osdc->request_mutex);
	dout("sync done (thru tid %llu)\n", last_tid);
}
EXPORT_SYMBOL(ceph_osdc_sync);

/*
 * Call all pending notify callbacks - for use after a watch is
 * unregistered, to make sure no more callbacks for it will be invoked
 */
void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc)
{
	flush_workqueue(osdc->notify_wq);
}
EXPORT_SYMBOL(ceph_osdc_flush_notifies);


/*
 * init, shutdown
 */
int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
{
	int err;

	dout("init\n");
	osdc->client = client;
	osdc->osdmap = NULL;
	init_rwsem(&osdc->map_sem);
	mutex_init(&osdc->request_mutex);
	osdc->last_tid = 0;
	osdc->osds = RB_ROOT;
	INIT_LIST_HEAD(&osdc->osd_lru);
	osdc->requests = RB_ROOT;
	INIT_LIST_HEAD(&osdc->req_lru);
	INIT_LIST_HEAD(&osdc->req_unsent);
	INIT_LIST_HEAD(&osdc->req_notarget);
	INIT_LIST_HEAD(&osdc->req_linger);
	osdc->num_requests = 0;
	INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
	INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
	spin_lock_init(&osdc->event_lock);
	osdc->event_tree = RB_ROOT;
	osdc->event_count = 0;

	err = -ENOMEM;
	osdc->req_mempool = mempool_create_slab_pool(10,
						     ceph_osd_request_cache);
	if (!osdc->req_mempool)
		goto out;

	err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP,
				PAGE_SIZE, 10, true, "osd_op");
	if (err < 0)
		goto out_mempool;
	err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY,
				PAGE_SIZE, 10, true, "osd_op_reply");
	if (err < 0)
		goto out_msgpool;

	err = -ENOMEM;
	osdc->notify_wq = create_singlethread_workqueue("ceph-watch-notify");
	if (!osdc->notify_wq)
		goto out_msgpool_reply;

	schedule_delayed_work(&osdc->timeout_work,
			      osdc->client->options->osd_keepalive_timeout);
	schedule_delayed_work(&osdc->osds_timeout_work,
	    round_jiffies_relative(osdc->client->options->osd_idle_ttl));

	return 0;

out_msgpool_reply:
	ceph_msgpool_destroy(&osdc->msgpool_op_reply);
out_msgpool:
	ceph_msgpool_destroy(&osdc->msgpool_op);
out_mempool:
	mempool_destroy(osdc->req_mempool);
out:
	return err;
}

void ceph_osdc_stop(struct ceph_osd_client *osdc)
{
	flush_workqueue(osdc->notify_wq);
	destroy_workqueue(osdc->notify_wq);
	cancel_delayed_work_sync(&osdc->timeout_work);
	cancel_delayed_work_sync(&osdc->osds_timeout_work);

	mutex_lock(&osdc->request_mutex);
	while (!RB_EMPTY_ROOT(&osdc->osds)) {
		struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
						struct ceph_osd, o_node);
		remove_osd(osdc, osd);
	}
	mutex_unlock(&osdc->request_mutex);

	if (osdc->osdmap) {
		ceph_osdmap_destroy(osdc->osdmap);
		osdc->osdmap = NULL;
	}
	mempool_destroy(osdc->req_mempool);
	ceph_msgpool_destroy(&osdc->msgpool_op);
	ceph_msgpool_destroy(&osdc->msgpool_op_reply);
}

/*
 * Read some contiguous pages.  If we cross a stripe boundary, shorten
 * *plen.  Return number of bytes read, or error.
 */
int ceph_osdc_readpages(struct ceph_osd_client *osdc,
			struct ceph_vino vino, struct ceph_file_layout *layout,
			u64 off, u64 *plen,
			u32 truncate_seq, u64 truncate_size,
			struct page **pages, int num_pages, int page_align)
{
	struct ceph_osd_request *req;
	int rc = 0;

	dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
	     vino.snap, off, *plen);
	req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1,
				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
				    NULL, truncate_seq, truncate_size,
				    false);
	if (IS_ERR(req))
		return PTR_ERR(req);

	/* it may be a short read due to an object boundary */
	osd_req_op_extent_osd_data_pages(req, 0,
				pages, *plen, page_align, false, false);

	dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n",
	     off, *plen, *plen, page_align);

	rc = ceph_osdc_start_request(osdc, req, false);
	if (!rc)
		rc = ceph_osdc_wait_request(osdc, req);

	ceph_osdc_put_request(req);
	dout("readpages result %d\n", rc);
	return rc;
}
EXPORT_SYMBOL(ceph_osdc_readpages);

/*
 * do a synchronous write on N pages
 */
int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
			 struct ceph_file_layout *layout,
			 struct ceph_snap_context *snapc,
			 u64 off, u64 len,
			 u32 truncate_seq, u64 truncate_size,
			 struct timespec *mtime,
			 struct page **pages, int num_pages)
{
	struct ceph_osd_request *req;
	int rc = 0;
	int page_align = off & ~PAGE_MASK;

	req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
				    CEPH_OSD_OP_WRITE,
				    CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
				    snapc, truncate_seq, truncate_size,
				    true);
	if (IS_ERR(req))
		return PTR_ERR(req);

	/* it may be a short write due to an object boundary */
	osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
					 false, false);
	dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);

	req->r_mtime = *mtime;
	rc = ceph_osdc_start_request(osdc, req, true);
	if (!rc)
		rc = ceph_osdc_wait_request(osdc, req);

	ceph_osdc_put_request(req);
	if (rc == 0)
		rc = len;
	dout("writepages result %d\n", rc);
	return rc;
}
EXPORT_SYMBOL(ceph_osdc_writepages);

int ceph_osdc_setup(void)
{
	size_t size = sizeof(struct ceph_osd_request) +
	    CEPH_OSD_SLAB_OPS * sizeof(struct ceph_osd_req_op);

	BUG_ON(ceph_osd_request_cache);
	ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", size,
						   0, 0, NULL);

	return ceph_osd_request_cache ? 0 : -ENOMEM;
}
EXPORT_SYMBOL(ceph_osdc_setup);

void ceph_osdc_cleanup(void)
{
	BUG_ON(!ceph_osd_request_cache);
	kmem_cache_destroy(ceph_osd_request_cache);
	ceph_osd_request_cache = NULL;
}
EXPORT_SYMBOL(ceph_osdc_cleanup);

/*
 * handle incoming message
 */
static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
{
	struct ceph_osd *osd = con->private;
	struct ceph_osd_client *osdc;
	int type = le16_to_cpu(msg->hdr.type);

	if (!osd)
		goto out;
	osdc = osd->o_osdc;

	switch (type) {
	case CEPH_MSG_OSD_MAP:
		ceph_osdc_handle_map(osdc, msg);
		break;
	case CEPH_MSG_OSD_OPREPLY:
		handle_reply(osdc, msg);
		break;
	case CEPH_MSG_WATCH_NOTIFY:
		handle_watch_notify(osdc, msg);
		break;

	default:
		pr_err("received unknown message type %d %s\n", type,
		       ceph_msg_type_name(type));
	}

out:
	ceph_msg_put(msg);
}

/*
 * Lookup and return message for incoming reply.  Don't try to do
 * anything about a larger than preallocated data portion of the
 * message at the moment - for now, just skip the message.
 */
static struct ceph_msg *get_reply(struct ceph_connection *con,
				  struct ceph_msg_header *hdr,
				  int *skip)
{
	struct ceph_osd *osd = con->private;
	struct ceph_osd_client *osdc = osd->o_osdc;
	struct ceph_msg *m;
	struct ceph_osd_request *req;
	int front_len = le32_to_cpu(hdr->front_len);
	int data_len = le32_to_cpu(hdr->data_len);
	u64 tid;

	tid = le64_to_cpu(hdr->tid);
	mutex_lock(&osdc->request_mutex);
	req = lookup_request(&osdc->requests, tid);
	if (!req) {
		dout("%s osd%d tid %llu unknown, skipping\n", __func__,
		     osd->o_osd, tid);
		m = NULL;
		*skip = 1;
		goto out;
	}

	ceph_msg_revoke_incoming(req->r_reply);

	if (front_len > req->r_reply->front_alloc_len) {
		pr_warn("%s osd%d tid %llu front %d > preallocated %d\n",
			__func__, osd->o_osd, req->r_tid, front_len,
			req->r_reply->front_alloc_len);
		m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS,
				 false);
		if (!m)
			goto out;
		ceph_msg_put(req->r_reply);
		req->r_reply = m;
	}

	if (data_len > req->r_reply->data_length) {
		pr_warn("%s osd%d tid %llu data %d > preallocated %zu, skipping\n",
			__func__, osd->o_osd, req->r_tid, data_len,
			req->r_reply->data_length);
		m = NULL;
		*skip = 1;
		goto out;
	}

	m = ceph_msg_get(req->r_reply);
	dout("get_reply tid %lld %p\n", tid, m);

out:
	mutex_unlock(&osdc->request_mutex);
	return m;
}

static struct ceph_msg *alloc_msg(struct ceph_connection *con,
				  struct ceph_msg_header *hdr,
				  int *skip)
{
	struct ceph_osd *osd = con->private;
	int type = le16_to_cpu(hdr->type);
	int front = le32_to_cpu(hdr->front_len);

	*skip = 0;
	switch (type) {
	case CEPH_MSG_OSD_MAP:
	case CEPH_MSG_WATCH_NOTIFY:
		return ceph_msg_new(type, front, GFP_NOFS, false);
	case CEPH_MSG_OSD_OPREPLY:
		return get_reply(con, hdr, skip);
	default:
		pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
			osd->o_osd);
		*skip = 1;
		return NULL;
	}
}

/*
 * Wrappers to refcount containing ceph_osd struct
 */
static struct ceph_connection *get_osd_con(struct ceph_connection *con)
{
	struct ceph_osd *osd = con->private;

	if (get_osd(osd))
		return con;
	return NULL;
}

static void put_osd_con(struct ceph_connection *con)
{
	struct ceph_osd *osd = con->private;

	put_osd(osd);
}

/*
 * authentication
 */

/*
 * Note: returned pointer is the address of a structure that's
 * managed separately.  Caller must *not* attempt to free it.
 */
static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
						  int *proto, int force_new)
{
	struct ceph_osd *o = con->private;
	struct ceph_osd_client *osdc = o->o_osdc;
	struct ceph_auth_client *ac = osdc->client->monc.auth;
	struct ceph_auth_handshake *auth = &o->o_auth;

	if (force_new && auth->authorizer) {
		ceph_auth_destroy_authorizer(auth->authorizer);
		auth->authorizer = NULL;
	}
	if (!auth->authorizer) {
		int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
						      auth);
		if (ret)
			return ERR_PTR(ret);
	} else {
		int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
						      auth);
		if (ret)
			return ERR_PTR(ret);
	}
	*proto = ac->protocol;

	return auth;
}

static int verify_authorizer_reply(struct ceph_connection *con, int len)
{
	struct ceph_osd *o = con->private;
	struct ceph_osd_client *osdc = o->o_osdc;
	struct ceph_auth_client *ac = osdc->client->monc.auth;

	return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer, len);
}

static int invalidate_authorizer(struct ceph_connection *con)
{
	struct ceph_osd *o = con->private;
	struct ceph_osd_client *osdc = o->o_osdc;
	struct ceph_auth_client *ac = osdc->client->monc.auth;

	ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
	return ceph_monc_validate_auth(&osdc->client->monc);
}

static int osd_sign_message(struct ceph_msg *msg)
{
	struct ceph_osd *o = msg->con->private;
	struct ceph_auth_handshake *auth = &o->o_auth;

	return ceph_auth_sign_message(auth, msg);
}

static int osd_check_message_signature(struct ceph_msg *msg)
{
	struct ceph_osd *o = msg->con->private;
	struct ceph_auth_handshake *auth = &o->o_auth;

	return ceph_auth_check_message_signature(auth, msg);
}

static const struct ceph_connection_operations osd_con_ops = {
	.get = get_osd_con,
	.put = put_osd_con,
	.dispatch = dispatch,
	.get_authorizer = get_authorizer,
	.verify_authorizer_reply = verify_authorizer_reply,
	.invalidate_authorizer = invalidate_authorizer,
	.alloc_msg = alloc_msg,
	.sign_message = osd_sign_message,
	.check_message_signature = osd_check_message_signature,
	.fault = osd_reset,
};