/*
 * Copyright (C) 2015, SUSE
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 */

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/dlm.h>
#include <linux/sched.h>
#include <linux/raid/md_p.h>
#include "md.h"
#include "bitmap.h"
#include "md-cluster.h"

#define LVB_SIZE	64
#define NEW_DEV_TIMEOUT 5000

struct dlm_lock_resource {
	dlm_lockspace_t *ls;
	struct dlm_lksb lksb;
	char *name; /* lock name. */
	uint32_t flags; /* flags to pass to dlm_lock() */
	wait_queue_head_t sync_locking; /* wait queue for synchronized locking */
	bool sync_locking_done;
	void (*bast)(void *arg, int mode); /* blocking AST function pointer */
	struct mddev *mddev; /* pointing back to mddev. */
	int mode;
};

struct suspend_info {
	int slot;
	sector_t lo;
	sector_t hi;
	struct list_head list;
};

struct resync_info {
	__le64 lo;
	__le64 hi;
};

/* md_cluster_info flags */
#define MD_CLUSTER_WAITING_FOR_NEWDISK 1
#define MD_CLUSTER_SUSPEND_READ_BALANCING 2
#define MD_CLUSTER_BEGIN_JOIN_CLUSTER 3

/* Lock the send communication. This is done through
 * bit manipulation as opposed to a mutex in order to
 * accommodate lock and hold. See next comment.
 */
#define MD_CLUSTER_SEND_LOCK 4
/* Cluster operations (such as adding a disk) may need to lock the
 * communication channel in order to perform extra operations
 * (update metadata) while no other operation is allowed on the MD.
 * The token then needs to be locked and held until the operation
 * completes with a md_update_sb(), which would eventually release
 * the lock.
 */
#define MD_CLUSTER_SEND_LOCKED_ALREADY 5
/* We should receive messages after the node has joined the cluster and
 * set up all the related info such as bitmap and personality */
#define MD_CLUSTER_ALREADY_IN_CLUSTER 6
#define MD_CLUSTER_PENDING_RECV_EVENT 7
#define MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD 8

struct md_cluster_info {
	struct mddev *mddev; /* the md device which md_cluster_info belongs to */
	/* dlm lock space and resources for clustered raid. */
	dlm_lockspace_t *lockspace;
	int slot_number;
	struct completion completion;
	struct mutex recv_mutex;
	struct dlm_lock_resource *bitmap_lockres;
	struct dlm_lock_resource **other_bitmap_lockres;
	struct dlm_lock_resource *resync_lockres;
	struct list_head suspend_list;
	spinlock_t suspend_lock;
	struct md_thread *recovery_thread;
	unsigned long recovery_map;
	/* communication lock resources */
	struct dlm_lock_resource *ack_lockres;
	struct dlm_lock_resource *message_lockres;
	struct dlm_lock_resource *token_lockres;
	struct dlm_lock_resource *no_new_dev_lockres;
	struct md_thread *recv_thread;
	struct completion newdisk_completion;
	wait_queue_head_t wait;
	unsigned long state;
	/* record the region in RESYNCING message */
	sector_t sync_low;
	sector_t sync_hi;
};

enum msg_type {
	METADATA_UPDATED = 0,
	RESYNCING,
	NEWDISK,
	REMOVE,
	RE_ADD,
	BITMAP_NEEDS_SYNC,
	CHANGE_CAPACITY,
};

struct cluster_msg {
	__le32 type;
	__le32 slot;
	/* TODO: Unionize this for smaller footprint */
	__le64 low;
	__le64 high;
	char uuid[16];
	__le32 raid_slot;
};

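/*
 * Synchronous DLM locking helpers: dlm_lock() itself is asynchronous, so
 * sync_ast() is passed as the completion AST and dlm_lock_sync() sleeps on
 * res->sync_locking until the AST has fired, then reports the lock status
 * from the lksb. dlm_unlock_sync() simply down-converts the lock to NL,
 * since every resource here is held at some mode for its whole lifetime.
 */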
static void sync_ast(void *arg)
{
	struct dlm_lock_resource *res;

	res = arg;
	res->sync_locking_done = true;
	wake_up(&res->sync_locking);
}

static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
{
	int ret = 0;

	ret = dlm_lock(res->ls, mode, &res->lksb,
			res->flags, res->name, strlen(res->name),
			0, sync_ast, res, res->bast);
	if (ret)
		return ret;
	wait_event(res->sync_locking, res->sync_locking_done);
	res->sync_locking_done = false;
	if (res->lksb.sb_status == 0)
		res->mode = mode;
	return res->lksb.sb_status;
}

static int dlm_unlock_sync(struct dlm_lock_resource *res)
{
	return dlm_lock_sync(res, DLM_LOCK_NL);
}

/*
 * A variation of dlm_lock_sync() that allows the lock request to be
 * interrupted.
 */
static int dlm_lock_sync_interruptible(struct dlm_lock_resource *res, int mode,
		struct mddev *mddev)
{
	int ret = 0;

	ret = dlm_lock(res->ls, mode, &res->lksb,
			res->flags, res->name, strlen(res->name),
			0, sync_ast, res, res->bast);
	if (ret)
		return ret;

	wait_event(res->sync_locking, res->sync_locking_done
				      || kthread_should_stop()
				      || test_bit(MD_CLOSING, &mddev->flags));
	if (!res->sync_locking_done) {
		/*
		 * the convert queue contains the lock request when the request
		 * is interrupted, and sync_ast could still be run, so we need
		 * to cancel the request and reset the completion
		 */
		ret = dlm_unlock(res->ls, res->lksb.sb_lkid, DLM_LKF_CANCEL,
			&res->lksb, res);
		res->sync_locking_done = false;
		if (unlikely(ret != 0))
			pr_info("failed to cancel previous lock request %s return %d\n",
				res->name, ret);
		return -EPERM;
	} else
		res->sync_locking_done = false;
	if (res->lksb.sb_status == 0)
		res->mode = mode;
	return res->lksb.sb_status;
}

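/*
 * lockres_init() allocates a lock resource in the array's lockspace and
 * immediately takes it in NL mode (with DLM_LKF_EXPEDITE so the initial
 * request cannot block); afterwards DLM_LKF_CONVERT is set so that all
 * later dlm_lock_sync() calls convert the existing lock rather than
 * acquiring a new one. lockres_free() undoes this with DLM_LKF_FORCEUNLOCK.
 */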
static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
		char *name, void (*bastfn)(void *arg, int mode), int with_lvb)
{
	struct dlm_lock_resource *res = NULL;
	int ret, namelen;
	struct md_cluster_info *cinfo = mddev->cluster_info;

	res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
	if (!res)
		return NULL;
	init_waitqueue_head(&res->sync_locking);
	res->sync_locking_done = false;
	res->ls = cinfo->lockspace;
	res->mddev = mddev;
	res->mode = DLM_LOCK_IV;
	namelen = strlen(name);
	res->name = kzalloc(namelen + 1, GFP_KERNEL);
	if (!res->name) {
		pr_err("md-cluster: Unable to allocate resource name for resource %s\n", name);
		goto out_err;
	}
	strlcpy(res->name, name, namelen + 1);
	if (with_lvb) {
		res->lksb.sb_lvbptr = kzalloc(LVB_SIZE, GFP_KERNEL);
		if (!res->lksb.sb_lvbptr) {
			pr_err("md-cluster: Unable to allocate LVB for resource %s\n", name);
			goto out_err;
		}
		res->flags = DLM_LKF_VALBLK;
	}

	if (bastfn)
		res->bast = bastfn;

	res->flags |= DLM_LKF_EXPEDITE;

	ret = dlm_lock_sync(res, DLM_LOCK_NL);
	if (ret) {
		pr_err("md-cluster: Unable to lock NL on new lock resource %s\n", name);
		goto out_err;
	}
	res->flags &= ~DLM_LKF_EXPEDITE;
	res->flags |= DLM_LKF_CONVERT;

	return res;
out_err:
	kfree(res->lksb.sb_lvbptr);
	kfree(res->name);
	kfree(res);
	return NULL;
}

static void lockres_free(struct dlm_lock_resource *res)
{
	int ret = 0;

	if (!res)
		return;

	/*
	 * use the FORCEUNLOCK flag, so we can unlock even if the lock is on
	 * the waiting or convert queue
	 */
	ret = dlm_unlock(res->ls, res->lksb.sb_lkid, DLM_LKF_FORCEUNLOCK,
		&res->lksb, res);
	if (unlikely(ret != 0))
		pr_err("failed to unlock %s return %d\n", res->name, ret);
	else
		wait_event(res->sync_locking, res->sync_locking_done);

	kfree(res->name);
	kfree(res->lksb.sb_lvbptr);
	kfree(res);
}

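/*
 * The resync window is exchanged through the LVB of each node's bitmap lock
 * resource: add_resync_info() packs the [lo, hi] sectors into the LVB as
 * little-endian values, and read_resync_info() takes the lock in CR mode to
 * fetch the peer's LVB and, if a window is published (hi > 0), returns it
 * wrapped in a freshly allocated suspend_info.
 */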
static void add_resync_info(struct dlm_lock_resource *lockres,
			    sector_t lo, sector_t hi)
{
	struct resync_info *ri;

	ri = (struct resync_info *)lockres->lksb.sb_lvbptr;
	ri->lo = cpu_to_le64(lo);
	ri->hi = cpu_to_le64(hi);
}

static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres)
{
	struct resync_info ri;
	struct suspend_info *s = NULL;
	sector_t hi = 0;

	dlm_lock_sync(lockres, DLM_LOCK_CR);
	memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info));
	hi = le64_to_cpu(ri.hi);
	if (hi > 0) {
		s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
		if (!s)
			goto out;
		s->hi = hi;
		s->lo = le64_to_cpu(ri.lo);
	}
	dlm_unlock_sync(lockres);
out:
	return s;
}

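/*
 * recover_bitmaps() is the body of the "recover" thread. For every slot set
 * in cinfo->recovery_map it drops any suspend_info recorded for that slot,
 * takes the failed node's bitmap lock in PW mode, folds that bitmap into the
 * local one with bitmap_copy_from_slot(), and pulls recovery_cp back so the
 * local node resyncs the regions the dead node left dirty.
 */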
static void recover_bitmaps(struct md_thread *thread)
{
	struct mddev *mddev = thread->mddev;
	struct md_cluster_info *cinfo = mddev->cluster_info;
	struct dlm_lock_resource *bm_lockres;
	char str[64];
	int slot, ret;
	struct suspend_info *s, *tmp;
	sector_t lo, hi;

	while (cinfo->recovery_map) {
		slot = fls64((u64)cinfo->recovery_map) - 1;

		/* Clear suspend_area associated with the bitmap */
		spin_lock_irq(&cinfo->suspend_lock);
		list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
			if (slot == s->slot) {
				list_del(&s->list);
				kfree(s);
			}
		spin_unlock_irq(&cinfo->suspend_lock);

		snprintf(str, 64, "bitmap%04d", slot);
		bm_lockres = lockres_init(mddev, str, NULL, 1);
		if (!bm_lockres) {
			pr_err("md-cluster: Cannot initialize bitmaps\n");
			goto clear_bit;
		}

		ret = dlm_lock_sync_interruptible(bm_lockres, DLM_LOCK_PW, mddev);
		if (ret) {
			pr_err("md-cluster: Could not DLM lock %s: %d\n",
					str, ret);
			goto clear_bit;
		}
		ret = bitmap_copy_from_slot(mddev, slot, &lo, &hi, true);
		if (ret) {
			pr_err("md-cluster: Could not copy data from bitmap %d\n", slot);
			goto clear_bit;
		}
		if (hi > 0) {
			if (lo < mddev->recovery_cp)
				mddev->recovery_cp = lo;
			/* wake up thread to continue resync in case resync
			 * is not finished */
			if (mddev->recovery_cp != MaxSector) {
				set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
				md_wakeup_thread(mddev->thread);
			}
		}
clear_bit:
		lockres_free(bm_lockres);
		clear_bit(slot, &cinfo->recovery_map);
	}
}

static void recover_prep(void *arg)
{
	struct mddev *mddev = arg;
	struct md_cluster_info *cinfo = mddev->cluster_info;
	set_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
}

static void __recover_slot(struct mddev *mddev, int slot)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	set_bit(slot, &cinfo->recovery_map);
	if (!cinfo->recovery_thread) {
		cinfo->recovery_thread = md_register_thread(recover_bitmaps,
				mddev, "recover");
		if (!cinfo->recovery_thread) {
			pr_warn("md-cluster: Could not create recovery thread\n");
			return;
		}
	}
	md_wakeup_thread(cinfo->recovery_thread);
}

static void recover_slot(void *arg, struct dlm_slot *slot)
{
	struct mddev *mddev = arg;
	struct md_cluster_info *cinfo = mddev->cluster_info;

	pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n",
			mddev->bitmap_info.cluster_name,
			slot->nodeid, slot->slot,
			cinfo->slot_number);
	/* deduct one since dlm slot numbers start from one while the slot
	 * numbers of cluster-md begin with 0 */
	__recover_slot(mddev, slot->slot - 1);
}

static void recover_done(void *arg, struct dlm_slot *slots,
		int num_slots, int our_slot,
		uint32_t generation)
{
	struct mddev *mddev = arg;
	struct md_cluster_info *cinfo = mddev->cluster_info;

	cinfo->slot_number = our_slot;
	/* the completion only needs to be completed when the node joins the
	 * cluster; it doesn't need to run during another node's failure */
	if (test_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state)) {
		complete(&cinfo->completion);
		clear_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
	}
	clear_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
}

/* These ops are called when a node joins the cluster, and they perform lock
 * recovery if a node failure occurs */
static const struct dlm_lockspace_ops md_ls_ops = {
	.recover_prep = recover_prep,
	.recover_slot = recover_slot,
	.recover_done = recover_done,
};

/*
 * The BAST function for the ack lock resource
 * This function wakes up the receive thread in
 * order to receive and process the message.
 */
static void ack_bast(void *arg, int mode)
{
	struct dlm_lock_resource *res = arg;
	struct md_cluster_info *cinfo = res->mddev->cluster_info;

	if (mode == DLM_LOCK_EX) {
		if (test_bit(MD_CLUSTER_ALREADY_IN_CLUSTER, &cinfo->state))
			md_wakeup_thread(cinfo->recv_thread);
		else
			set_bit(MD_CLUSTER_PENDING_RECV_EVENT, &cinfo->state);
	}
}

static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot)
{
	struct suspend_info *s, *tmp;

	list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
		if (slot == s->slot) {
			list_del(&s->list);
			kfree(s);
			break;
		}
}

static void remove_suspend_info(struct mddev *mddev, int slot)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	spin_lock_irq(&cinfo->suspend_lock);
	__remove_suspend_info(cinfo, slot);
	spin_unlock_irq(&cinfo->suspend_lock);
	mddev->pers->quiesce(mddev, 2);
}

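/*
 * process_suspend_info() handles an incoming RESYNCING message. A message
 * with hi == 0 means the sender has finished resyncing, so its suspended
 * region is dropped and local recovery is kicked. Otherwise the local
 * bitmap is brought in line with the sender's progress via
 * bitmap_sync_with_cluster() and [lo, hi] is recorded in suspend_list so
 * that area_resyncing() can report overlap with the peer's resync window.
 */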
static void process_suspend_info(struct mddev *mddev,
		int slot, sector_t lo, sector_t hi)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	struct suspend_info *s;

	if (!hi) {
		remove_suspend_info(mddev, slot);
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
		return;
	}

	/*
	 * The bitmaps are not the same for different nodes.
	 * If RESYNCING is happening in one node, then
	 * the node which received the RESYNCING message
	 * probably will perform resync with the region
	 * [lo, hi] again, so we could reduce resync time
	 * a lot if we can ensure that the bitmaps among
	 * different nodes match up well.
	 *
	 * sync_low/hi is used to record the region which
	 * arrived in the previous RESYNCING message.
	 *
	 * Call bitmap_sync_with_cluster to clear
	 * NEEDED_MASK and set RESYNC_MASK since
	 * the resync thread is running in another node,
	 * so we don't need to do the resync again
	 * with the same section */
	bitmap_sync_with_cluster(mddev, cinfo->sync_low,
					cinfo->sync_hi,
					lo, hi);
	cinfo->sync_low = lo;
	cinfo->sync_hi = hi;

	s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
	if (!s)
		return;
	s->slot = slot;
	s->lo = lo;
	s->hi = hi;
	mddev->pers->quiesce(mddev, 1);
	mddev->pers->quiesce(mddev, 0);
	spin_lock_irq(&cinfo->suspend_lock);
	/* Remove existing entry (if exists) before adding */
	__remove_suspend_info(cinfo, slot);
	list_add(&s->list, &cinfo->suspend_list);
	spin_unlock_irq(&cinfo->suspend_lock);
	mddev->pers->quiesce(mddev, 2);
}

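/*
 * process_add_new_disk() handles a NEWDISK message from another node: it
 * raises a KOBJ_CHANGE uevent carrying the device UUID and raid slot so that
 * userspace can bind the same device locally, then waits (up to
 * NEW_DEV_TIMEOUT) for userspace to confirm through new_disk_ack() below.
 */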
static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
{
	char disk_uuid[64];
	struct md_cluster_info *cinfo = mddev->cluster_info;
	char event_name[] = "EVENT=ADD_DEVICE";
	char raid_slot[16];
	char *envp[] = {event_name, disk_uuid, raid_slot, NULL};
	int len;

	len = snprintf(disk_uuid, 64, "DEVICE_UUID=");
	sprintf(disk_uuid + len, "%pU", cmsg->uuid);
	snprintf(raid_slot, 16, "RAID_DISK=%d", le32_to_cpu(cmsg->raid_slot));
	pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot);
	init_completion(&cinfo->newdisk_completion);
	set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
	kobject_uevent_env(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE, envp);
	wait_for_completion_timeout(&cinfo->newdisk_completion,
			NEW_DEV_TIMEOUT);
	clear_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
}

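/*
 * process_metadata_update() re-reads the superblock after another node has
 * updated it. It must not deadlock with a local thread that holds
 * reconfig_mutex while waiting to send, so it waits until it can either take
 * the mutex itself or the sender has set MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD
 * to signal that the reload may proceed without it.
 */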
static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
{
	int got_lock = 0;
	struct md_cluster_info *cinfo = mddev->cluster_info;
	mddev->good_device_nr = le32_to_cpu(msg->raid_slot);

	dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
	wait_event(mddev->thread->wqueue,
		   (got_lock = mddev_trylock(mddev)) ||
		    test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state));
	md_reload_sb(mddev, mddev->good_device_nr);
	if (got_lock)
		mddev_unlock(mddev);
}

static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
{
	struct md_rdev *rdev;

	rcu_read_lock();
	rdev = md_find_rdev_nr_rcu(mddev, le32_to_cpu(msg->raid_slot));
	if (rdev) {
		set_bit(ClusterRemove, &rdev->flags);
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	} else
		pr_warn("%s: %d Could not find disk(%d) to REMOVE\n",
			__func__, __LINE__, le32_to_cpu(msg->raid_slot));
	rcu_read_unlock();
}

static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg)
{
	struct md_rdev *rdev;

	rcu_read_lock();
	rdev = md_find_rdev_nr_rcu(mddev, le32_to_cpu(msg->raid_slot));
	if (rdev && test_bit(Faulty, &rdev->flags))
		clear_bit(Faulty, &rdev->flags);
	else
		pr_warn("%s: %d Could not find disk(%d) which is faulty\n",
			__func__, __LINE__, le32_to_cpu(msg->raid_slot));
	rcu_read_unlock();
}

static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
{
	int ret = 0;

	if (WARN(mddev->cluster_info->slot_number - 1 == le32_to_cpu(msg->slot),
		"node %d received its own msg\n", le32_to_cpu(msg->slot)))
		return -1;
	switch (le32_to_cpu(msg->type)) {
	case METADATA_UPDATED:
		process_metadata_update(mddev, msg);
		break;
	case CHANGE_CAPACITY:
		set_capacity(mddev->gendisk, mddev->array_sectors);
		revalidate_disk(mddev->gendisk);
		break;
	case RESYNCING:
		process_suspend_info(mddev, le32_to_cpu(msg->slot),
				     le64_to_cpu(msg->low),
				     le64_to_cpu(msg->high));
		break;
	case NEWDISK:
		process_add_new_disk(mddev, msg);
		break;
	case REMOVE:
		process_remove_disk(mddev, msg);
		break;
	case RE_ADD:
		process_readd_disk(mddev, msg);
		break;
	case BITMAP_NEEDS_SYNC:
		__recover_slot(mddev, le32_to_cpu(msg->slot));
		break;
	default:
		ret = -1;
		pr_warn("%s:%d Received unknown message from %d\n",
			__func__, __LINE__, le32_to_cpu(msg->slot));
	}
	return ret;
}

/*
 * thread for receiving message
 */
static void recv_daemon(struct md_thread *thread)
{
	struct md_cluster_info *cinfo = thread->mddev->cluster_info;
	struct dlm_lock_resource *ack_lockres = cinfo->ack_lockres;
	struct dlm_lock_resource *message_lockres = cinfo->message_lockres;
	struct cluster_msg msg;
	int ret;

	mutex_lock(&cinfo->recv_mutex);
	/*get CR on Message*/
	if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) {
		pr_err("md/raid1:failed to get CR on MESSAGE\n");
		mutex_unlock(&cinfo->recv_mutex);
		return;
	}

	/* read lvb and wake up thread to process this message_lockres */
	memcpy(&msg, message_lockres->lksb.sb_lvbptr, sizeof(struct cluster_msg));
	ret = process_recvd_msg(thread->mddev, &msg);
	if (ret)
		goto out;

	/*release CR on ack_lockres*/
	ret = dlm_unlock_sync(ack_lockres);
	if (unlikely(ret != 0))
		pr_info("unlock ack failed return %d\n", ret);
	/*up-convert to PR on message_lockres*/
	ret = dlm_lock_sync(message_lockres, DLM_LOCK_PR);
	if (unlikely(ret != 0))
		pr_info("lock PR on msg failed return %d\n", ret);
	/*get CR on ack_lockres again*/
	ret = dlm_lock_sync(ack_lockres, DLM_LOCK_CR);
	if (unlikely(ret != 0))
		pr_info("lock CR on ack failed return %d\n", ret);
out:
	/*release CR on message_lockres*/
	ret = dlm_unlock_sync(message_lockres);
	if (unlikely(ret != 0))
		pr_info("unlock msg failed return %d\n", ret);
	mutex_unlock(&cinfo->recv_mutex);
}

/* lock_token()
 * Takes the lock on the TOKEN lock resource so no other
 * node can communicate while the operation is underway.
 */
static int lock_token(struct md_cluster_info *cinfo, bool mddev_locked)
{
	int error, set_bit = 0;
	struct mddev *mddev = cinfo->mddev;

	/*
	 * If the resync thread runs after the raid1d thread, then
	 * process_metadata_update could not continue while raid1d holds
	 * reconfig_mutex (and raid1d is blocked since another node already
	 * got EX on Token and is waiting for EX on Ack), so let resync wake
	 * up the thread in case the flag is set.
	 */
	if (mddev_locked && !test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
				      &cinfo->state)) {
		error = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
					      &cinfo->state);
		WARN_ON_ONCE(error);
		md_wakeup_thread(mddev->thread);
		set_bit = 1;
	}
	error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
	if (set_bit)
		clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);

	if (error)
		pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
				__func__, __LINE__, error);

	/* Lock the receive sequence */
	mutex_lock(&cinfo->recv_mutex);
	return error;
}

/* lock_comm()
 * Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel.
 */
static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked)
{
	wait_event(cinfo->wait,
		   !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state));

	return lock_token(cinfo, mddev_locked);
}

static void unlock_comm(struct md_cluster_info *cinfo)
{
	WARN_ON(cinfo->token_lockres->mode != DLM_LOCK_EX);
	mutex_unlock(&cinfo->recv_mutex);
	dlm_unlock_sync(cinfo->token_lockres);
	clear_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state);
	wake_up(&cinfo->wait);
}

/* __sendmsg()
 * This function performs the actual sending of the message. This function is
 * usually called after performing the encompassing operation
 * The function:
 * 1. Grabs the message lockresource in EX mode
 * 2. Copies the message to the message LVB
 * 3. Downconverts message lockresource to CW
 * 4. Upconverts ack lock resource from CR to EX. This forces the BAST on other nodes
 *    and the other nodes read the message. The thread will wait here until all other
 *    nodes have released ack lock resource.
 * 5. Downconvert ack lockresource to CR
 */
static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
{
	int error;
	int slot = cinfo->slot_number - 1;

	cmsg->slot = cpu_to_le32(slot);
	/*get EX on Message*/
	error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_EX);
	if (error) {
		pr_err("md-cluster: failed to get EX on MESSAGE (%d)\n", error);
		goto failed_message;
	}

	memcpy(cinfo->message_lockres->lksb.sb_lvbptr, (void *)cmsg,
			sizeof(struct cluster_msg));
	/*down-convert EX to CW on Message*/
	error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CW);
	if (error) {
		pr_err("md-cluster: failed to convert EX to CW on MESSAGE(%d)\n",
				error);
		goto failed_ack;
	}

	/*up-convert CR to EX on Ack*/
	error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_EX);
	if (error) {
		pr_err("md-cluster: failed to convert CR to EX on ACK(%d)\n",
				error);
		goto failed_ack;
	}

	/*down-convert EX to CR on Ack*/
	error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR);
	if (error) {
		pr_err("md-cluster: failed to convert EX to CR on ACK(%d)\n",
				error);
		goto failed_ack;
	}

failed_ack:
	error = dlm_unlock_sync(cinfo->message_lockres);
	if (unlikely(error != 0)) {
		pr_err("md-cluster: failed convert to NL on MESSAGE(%d)\n",
			error);
		/* in case the message can't be released due to some reason */
		goto failed_ack;
	}
failed_message:
	return error;
}

static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg,
		   bool mddev_locked)
{
	int ret;

	lock_comm(cinfo, mddev_locked);
	ret = __sendmsg(cinfo, cmsg);
	unlock_comm(cinfo);
	return ret;
}

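/*
 * gather_all_resync_info() runs once at join time. For every other slot it
 * tries to grab that slot's bitmap lock in PW mode without queueing: if the
 * lock is busy (-EAGAIN) the owner is resyncing, so its published resync
 * window is copied from the LVB into suspend_list; if the lock is granted,
 * the slot's bitmap is read and recovery_cp is pulled back when the bitmap
 * shows dirty regions that still need recovery.
 */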
static int gather_all_resync_info(struct mddev *mddev, int total_slots)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	int i, ret = 0;
	struct dlm_lock_resource *bm_lockres;
	struct suspend_info *s;
	char str[64];
	sector_t lo, hi;

	for (i = 0; i < total_slots; i++) {
		memset(str, '\0', 64);
		snprintf(str, 64, "bitmap%04d", i);
		bm_lockres = lockres_init(mddev, str, NULL, 1);
		if (!bm_lockres)
			return -ENOMEM;
		if (i == (cinfo->slot_number - 1)) {
			lockres_free(bm_lockres);
			continue;
		}

		bm_lockres->flags |= DLM_LKF_NOQUEUE;
		ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
		if (ret == -EAGAIN) {
			s = read_resync_info(mddev, bm_lockres);
			if (s) {
				pr_info("%s:%d Resync[%llu..%llu] in progress on %d\n",
						__func__, __LINE__,
						(unsigned long long) s->lo,
						(unsigned long long) s->hi, i);
				spin_lock_irq(&cinfo->suspend_lock);
				s->slot = i;
				list_add(&s->list, &cinfo->suspend_list);
				spin_unlock_irq(&cinfo->suspend_lock);
			}
			ret = 0;
			lockres_free(bm_lockres);
			continue;
		}
		if (ret) {
			lockres_free(bm_lockres);
			goto out;
		}

		/* Read the disk bitmap sb and check if it needs recovery */
		ret = bitmap_copy_from_slot(mddev, i, &lo, &hi, false);
		if (ret) {
			pr_warn("md-cluster: Could not gather bitmaps from slot %d\n", i);
			lockres_free(bm_lockres);
			continue;
		}
		if ((hi > 0) && (lo < mddev->recovery_cp)) {
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			mddev->recovery_cp = lo;
			md_check_recovery(mddev);
		}

		lockres_free(bm_lockres);
	}
out:
	return ret;
}

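/*
 * join() creates (or joins) the DLM lockspace named after the array UUID,
 * waits for recover_done() to report the local slot number, and then sets up
 * the communication lock resources (message/token/ack/no-new-dev), the
 * receive thread, the local bitmap lock (held in PW for the array's
 * lifetime) and the resync lock. On any failure everything allocated so far
 * is torn down again.
 */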
static int join(struct mddev *mddev, int nodes)
{
	struct md_cluster_info *cinfo;
	int ret, ops_rv;
	char str[64];

	cinfo = kzalloc(sizeof(struct md_cluster_info), GFP_KERNEL);
	if (!cinfo)
		return -ENOMEM;

	INIT_LIST_HEAD(&cinfo->suspend_list);
	spin_lock_init(&cinfo->suspend_lock);
	init_completion(&cinfo->completion);
	set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
	init_waitqueue_head(&cinfo->wait);
	mutex_init(&cinfo->recv_mutex);

	mddev->cluster_info = cinfo;
	cinfo->mddev = mddev;

	memset(str, 0, 64);
	sprintf(str, "%pU", mddev->uuid);
	ret = dlm_new_lockspace(str, mddev->bitmap_info.cluster_name,
				DLM_LSFL_FS, LVB_SIZE,
				&md_ls_ops, mddev, &ops_rv, &cinfo->lockspace);
	if (ret)
		goto err;
	wait_for_completion(&cinfo->completion);
	if (nodes < cinfo->slot_number) {
		pr_err("md-cluster: Slot allotted(%d) is greater than available slots(%d).\n",
			cinfo->slot_number, nodes);
		ret = -ERANGE;
		goto err;
	}
	/* Initiate the communication resources */
	ret = -ENOMEM;
	cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv");
	if (!cinfo->recv_thread) {
		pr_err("md-cluster: cannot allocate memory for recv_thread!\n");
		goto err;
	}
	cinfo->message_lockres = lockres_init(mddev, "message", NULL, 1);
	if (!cinfo->message_lockres)
		goto err;
	cinfo->token_lockres = lockres_init(mddev, "token", NULL, 0);
	if (!cinfo->token_lockres)
		goto err;
	cinfo->no_new_dev_lockres = lockres_init(mddev, "no-new-dev", NULL, 0);
	if (!cinfo->no_new_dev_lockres)
		goto err;

	ret = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
	if (ret) {
		ret = -EAGAIN;
		pr_err("md-cluster: can't join cluster to avoid lock issue\n");
		goto err;
	}
	cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0);
	if (!cinfo->ack_lockres) {
		ret = -ENOMEM;
		goto err;
	}
	/* get sync CR lock on ACK. */
	ret = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR);
	if (ret)
		pr_err("md-cluster: failed to get a sync CR lock on ACK!(%d)\n",
				ret);
	dlm_unlock_sync(cinfo->token_lockres);
	/* get sync CR lock on no-new-dev. */
	ret = dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
	if (ret)
		pr_err("md-cluster: failed to get a sync CR lock on no-new-dev!(%d)\n", ret);

	pr_info("md-cluster: Joined cluster %s slot %d\n", str, cinfo->slot_number);
	snprintf(str, 64, "bitmap%04d", cinfo->slot_number - 1);
	cinfo->bitmap_lockres = lockres_init(mddev, str, NULL, 1);
	if (!cinfo->bitmap_lockres) {
		ret = -ENOMEM;
		goto err;
	}
	if (dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW)) {
		pr_err("Failed to get bitmap lock\n");
		ret = -EINVAL;
		goto err;
	}

	cinfo->resync_lockres = lockres_init(mddev, "resync", NULL, 0);
	if (!cinfo->resync_lockres) {
		ret = -ENOMEM;
		goto err;
	}

	return 0;
err:
	set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
	md_unregister_thread(&cinfo->recovery_thread);
	md_unregister_thread(&cinfo->recv_thread);
	lockres_free(cinfo->message_lockres);
	lockres_free(cinfo->token_lockres);
	lockres_free(cinfo->ack_lockres);
	lockres_free(cinfo->no_new_dev_lockres);
	lockres_free(cinfo->resync_lockres);
	lockres_free(cinfo->bitmap_lockres);
	if (cinfo->lockspace)
		dlm_release_lockspace(cinfo->lockspace, 2);
	mddev->cluster_info = NULL;
	kfree(cinfo);
	return ret;
}

static void load_bitmaps(struct mddev *mddev, int total_slots)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	/* load all the nodes' bitmap info for resync */
	if (gather_all_resync_info(mddev, total_slots))
		pr_err("md-cluster: failed to gather all resync info\n");
	set_bit(MD_CLUSTER_ALREADY_IN_CLUSTER, &cinfo->state);
	/* wake up recv thread in case something needs to be handled */
	if (test_and_clear_bit(MD_CLUSTER_PENDING_RECV_EVENT, &cinfo->state))
		md_wakeup_thread(cinfo->recv_thread);
}

static void resync_bitmap(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	struct cluster_msg cmsg = {0};
	int err;

	cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
	err = sendmsg(cinfo, &cmsg, 1);
	if (err)
		pr_err("%s:%d: failed to send BITMAP_NEEDS_SYNC message (%d)\n",
			__func__, __LINE__, err);
}

static void unlock_all_bitmaps(struct mddev *mddev);
static int leave(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	if (!cinfo)
		return 0;

	/* the BITMAP_NEEDS_SYNC message should be sent when a node
	 * is leaving the cluster with a dirty bitmap, and we can
	 * only deliver it while the dlm connection is available */
	if (cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector)
		resync_bitmap(mddev);

	set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
	md_unregister_thread(&cinfo->recovery_thread);
	md_unregister_thread(&cinfo->recv_thread);
	lockres_free(cinfo->message_lockres);
	lockres_free(cinfo->token_lockres);
	lockres_free(cinfo->ack_lockres);
	lockres_free(cinfo->no_new_dev_lockres);
	lockres_free(cinfo->resync_lockres);
	lockres_free(cinfo->bitmap_lockres);
	unlock_all_bitmaps(mddev);
	dlm_release_lockspace(cinfo->lockspace, 2);
	kfree(cinfo);
	return 0;
}

/* slot_number(): Returns the MD slot number to use
 * DLM starts the slot numbers from 1, whereas cluster-md
 * wants the number to be from zero, so we deduct one
 */
static int slot_number(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	return cinfo->slot_number - 1;
}

/*
 * Check if the communication is already locked, else lock the communication
 * channel.
 * If it is already locked, token is in EX mode, and hence lock_token()
 * should not be called.
 */
static int metadata_update_start(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	int ret;

	/*
	 * metadata_update_start is always called with the protection of
	 * reconfig_mutex, so set MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD here.
	 */
	ret = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
				    &cinfo->state);
	WARN_ON_ONCE(ret);
	md_wakeup_thread(mddev->thread);

	wait_event(cinfo->wait,
		   !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state) ||
		   test_and_clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state));

	/* If token is already locked, return 0 */
	if (cinfo->token_lockres->mode == DLM_LOCK_EX) {
		clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
		return 0;
	}

	ret = lock_token(cinfo, 1);
	clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
	return ret;
}

static int metadata_update_finish(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	struct cluster_msg cmsg;
	struct md_rdev *rdev;
	int ret = 0;
	int raid_slot = -1;

	memset(&cmsg, 0, sizeof(cmsg));
	cmsg.type = cpu_to_le32(METADATA_UPDATED);
	/* Pick up a good active device number to send.
	 */
	rdev_for_each(rdev, mddev)
		if (rdev->raid_disk > -1 && !test_bit(Faulty, &rdev->flags)) {
			raid_slot = rdev->desc_nr;
			break;
		}
	if (raid_slot >= 0) {
		cmsg.raid_slot = cpu_to_le32(raid_slot);
		ret = __sendmsg(cinfo, &cmsg);
	} else
		pr_warn("md-cluster: No good device id found to send\n");
	clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state);
	unlock_comm(cinfo);
	return ret;
}

static void metadata_update_cancel(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state);
	unlock_comm(cinfo);
}

/*
 * return 0 if all the bitmaps have the same sync_size
 */
int cluster_check_sync_size(struct mddev *mddev)
{
	int i, rv;
	bitmap_super_t *sb;
	unsigned long my_sync_size, sync_size = 0;
	int node_num = mddev->bitmap_info.nodes;
	int current_slot = md_cluster_ops->slot_number(mddev);
	struct bitmap *bitmap = mddev->bitmap;
	char str[64];
	struct dlm_lock_resource *bm_lockres;

	sb = kmap_atomic(bitmap->storage.sb_page);
	my_sync_size = sb->sync_size;
	kunmap_atomic(sb);

	for (i = 0; i < node_num; i++) {
		if (i == current_slot)
			continue;

		bitmap = get_bitmap_from_slot(mddev, i);
		if (IS_ERR(bitmap)) {
			pr_err("can't get bitmap from slot %d\n", i);
			return -1;
		}

		/*
		 * If we can hold the bitmap lock of one node then
		 * the slot is not occupied, update the sb.
		 */
		snprintf(str, 64, "bitmap%04d", i);
		bm_lockres = lockres_init(mddev, str, NULL, 1);
		if (!bm_lockres) {
			pr_err("md-cluster: Cannot initialize %s\n", str);
			bitmap_free(bitmap);
			return -1;
		}
		bm_lockres->flags |= DLM_LKF_NOQUEUE;
		rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
		if (!rv)
			bitmap_update_sb(bitmap);
		lockres_free(bm_lockres);

		sb = kmap_atomic(bitmap->storage.sb_page);
		if (sync_size == 0)
			sync_size = sb->sync_size;
		else if (sync_size != sb->sync_size) {
			kunmap_atomic(sb);
			bitmap_free(bitmap);
			return -1;
		}
		kunmap_atomic(sb);
		bitmap_free(bitmap);
	}

	return (my_sync_size == sync_size) ? 0 : -1;
}

/*
 * Updating the size for clustered raid is a little more complex; we perform
 * it in these steps:
 * 1. hold the token lock and update the superblock in the initiator node.
 * 2. send a METADATA_UPDATED msg to the other nodes.
 * 3. The initiator node continues to check each bitmap's sync_size; if all
 *    bitmaps have the same value of sync_size, then we can set the capacity
 *    and let the other nodes do the same. If one node can't update sync_size
 *    accordingly, we need to revert to the previous value.
 */
static void update_size(struct mddev *mddev, sector_t old_dev_sectors)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	struct cluster_msg cmsg;
	struct md_rdev *rdev;
	int ret = 0;
	int raid_slot = -1;

	md_update_sb(mddev, 1);
	lock_comm(cinfo, 1);

	memset(&cmsg, 0, sizeof(cmsg));
	cmsg.type = cpu_to_le32(METADATA_UPDATED);
	rdev_for_each(rdev, mddev)
		if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags)) {
			raid_slot = rdev->desc_nr;
			break;
		}
	if (raid_slot >= 0) {
		cmsg.raid_slot = cpu_to_le32(raid_slot);
		/*
		 * We can only change the capacity after all the nodes can do
		 * it, so we need to wait until the other nodes have received
		 * the msg and handled the change
		 */
		ret = __sendmsg(cinfo, &cmsg);
		if (ret) {
			pr_err("%s:%d: failed to send METADATA_UPDATED msg\n",
			       __func__, __LINE__);
			unlock_comm(cinfo);
			return;
		}
	} else {
		pr_err("md-cluster: No good device id found to send\n");
		unlock_comm(cinfo);
		return;
	}

	/*
	 * check the sync_size from the other nodes' bitmaps; if sync_size
	 * has already been updated in the other nodes as expected, send an
	 * empty metadata msg to permit the change of capacity
	 */
	if (cluster_check_sync_size(mddev) == 0) {
		memset(&cmsg, 0, sizeof(cmsg));
		cmsg.type = cpu_to_le32(CHANGE_CAPACITY);
		ret = __sendmsg(cinfo, &cmsg);
		if (ret)
			pr_err("%s:%d: failed to send CHANGE_CAPACITY msg\n",
			       __func__, __LINE__);
		set_capacity(mddev->gendisk, mddev->array_sectors);
		revalidate_disk(mddev->gendisk);
	} else {
		/* revert to previous sectors */
		ret = mddev->pers->resize(mddev, old_dev_sectors);
		if (!ret)
			revalidate_disk(mddev->gendisk);
		ret = __sendmsg(cinfo, &cmsg);
		if (ret)
			pr_err("%s:%d: failed to send METADATA_UPDATED msg\n",
			       __func__, __LINE__);
	}
	unlock_comm(cinfo);
}

static int resync_start(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	return dlm_lock_sync_interruptible(cinfo->resync_lockres, DLM_LOCK_EX, mddev);
}

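/*
 * resync_info_update() publishes the local resync window: it writes [lo, hi]
 * into the bitmap lock's LVB, re-acquires that lock in PW mode so the LVB is
 * refreshed, and broadcasts a RESYNCING message so the other nodes can update
 * their suspend_list. A window of [0, 0] means the resync has finished.
 */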
static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	struct resync_info ri;
	struct cluster_msg cmsg = {0};

	/* do not send zero again, if we have sent before */
	if (hi == 0) {
		memcpy(&ri, cinfo->bitmap_lockres->lksb.sb_lvbptr, sizeof(struct resync_info));
		if (le64_to_cpu(ri.hi) == 0)
			return 0;
	}

	add_resync_info(cinfo->bitmap_lockres, lo, hi);
	/* Re-acquire the lock to refresh LVB */
	dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
	cmsg.type = cpu_to_le32(RESYNCING);
	cmsg.low = cpu_to_le64(lo);
	cmsg.high = cpu_to_le64(hi);

	/*
	 * mddev_lock is held if resync_info_update is called from
	 * resync_finish (md_reap_sync_thread -> resync_finish)
	 */
	if (lo == 0 && hi == 0)
		return sendmsg(cinfo, &cmsg, 1);
	else
		return sendmsg(cinfo, &cmsg, 0);
}

static int resync_finish(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	dlm_unlock_sync(cinfo->resync_lockres);
	return resync_info_update(mddev, 0, 0);
}

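/*
 * area_resyncing() returns 1 when the range [lo, hi] overlaps a region in
 * suspend_list (a range another node is currently resyncing), or for READ
 * while MD_CLUSTER_SUSPEND_READ_BALANCING is set during slot recovery.
 */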
static int area_resyncing(struct mddev *mddev, int direction,
		sector_t lo, sector_t hi)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	int ret = 0;
	struct suspend_info *s;

	if ((direction == READ) &&
		test_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state))
		return 1;

	spin_lock_irq(&cinfo->suspend_lock);
	if (list_empty(&cinfo->suspend_list))
		goto out;
	list_for_each_entry(s, &cinfo->suspend_list, list)
		if (hi > s->lo && lo < s->hi) {
			ret = 1;
			break;
		}
out:
	spin_unlock_irq(&cinfo->suspend_lock);
	return ret;
}

/* add_new_disk() - initiates a disk add
 * However, if this fails before writing md_update_sb(),
 * add_new_disk_cancel() must be called to release token lock
 */
static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	struct cluster_msg cmsg;
	int ret = 0;
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	char *uuid = sb->device_uuid;

	memset(&cmsg, 0, sizeof(cmsg));
	cmsg.type = cpu_to_le32(NEWDISK);
	memcpy(cmsg.uuid, uuid, 16);
	cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
	lock_comm(cinfo, 1);
	ret = __sendmsg(cinfo, &cmsg);
	if (ret) {
		unlock_comm(cinfo);
		return ret;
	}
	cinfo->no_new_dev_lockres->flags |= DLM_LKF_NOQUEUE;
	ret = dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_EX);
	cinfo->no_new_dev_lockres->flags &= ~DLM_LKF_NOQUEUE;
	/* Some node does not "see" the device */
	if (ret == -EAGAIN)
		ret = -ENOENT;
	if (ret)
		unlock_comm(cinfo);
	else {
		dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
		/* Since MD_CHANGE_DEVS will be set in add_bound_rdev which
		 * will run soon after add_new_disk, the below path will be
		 * invoked:
		 *   md_wakeup_thread(mddev->thread)
		 *	-> conf->thread (raid1d)
		 *	-> md_check_recovery -> md_update_sb
		 *	-> metadata_update_start/finish
		 * MD_CLUSTER_SEND_LOCKED_ALREADY will be cleared eventually.
		 *
		 * For other failure cases, metadata_update_cancel and
		 * add_new_disk_cancel also clear below bit as well.
		 * */
		set_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state);
		wake_up(&cinfo->wait);
	}
	return ret;
}

static void add_new_disk_cancel(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state);
	unlock_comm(cinfo);
}

static int new_disk_ack(struct mddev *mddev, bool ack)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	if (!test_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state)) {
		pr_warn("md-cluster(%s): Spurious cluster confirmation\n", mdname(mddev));
		return -EINVAL;
	}

	if (ack)
		dlm_unlock_sync(cinfo->no_new_dev_lockres);
	complete(&cinfo->newdisk_completion);
	return 0;
}

static int remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{
	struct cluster_msg cmsg = {0};
	struct md_cluster_info *cinfo = mddev->cluster_info;
	cmsg.type = cpu_to_le32(REMOVE);
	cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
	return sendmsg(cinfo, &cmsg, 1);
}

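/*
 * lock_all_bitmaps() tries to take every other node's bitmap lock in PW mode
 * without queueing. It returns 1 if all of them could be held, -1 if any
 * attempt failed (i.e. another node still holds its bitmap), or 0 if the
 * tracking array can't be allocated; the locks that were obtained are kept
 * until unlock_all_bitmaps() frees them.
 */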
static int lock_all_bitmaps(struct mddev *mddev)
{
	int slot, my_slot, ret, held = 1, i = 0;
	char str[64];
	struct md_cluster_info *cinfo = mddev->cluster_info;

	cinfo->other_bitmap_lockres = kzalloc((mddev->bitmap_info.nodes - 1) *
					     sizeof(struct dlm_lock_resource *),
					     GFP_KERNEL);
	if (!cinfo->other_bitmap_lockres) {
		pr_err("md: can't alloc mem for other bitmap locks\n");
		return 0;
	}

	my_slot = slot_number(mddev);
	for (slot = 0; slot < mddev->bitmap_info.nodes; slot++) {
		if (slot == my_slot)
			continue;

		memset(str, '\0', 64);
		snprintf(str, 64, "bitmap%04d", slot);
		cinfo->other_bitmap_lockres[i] = lockres_init(mddev, str, NULL, 1);
		if (!cinfo->other_bitmap_lockres[i])
			return -ENOMEM;

		cinfo->other_bitmap_lockres[i]->flags |= DLM_LKF_NOQUEUE;
		ret = dlm_lock_sync(cinfo->other_bitmap_lockres[i], DLM_LOCK_PW);
		if (ret)
			held = -1;
		i++;
	}

	return held;
}

static void unlock_all_bitmaps(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	int i;

	/* release the other nodes' bitmap locks if they exist */
	if (cinfo->other_bitmap_lockres) {
		for (i = 0; i < mddev->bitmap_info.nodes - 1; i++) {
			if (cinfo->other_bitmap_lockres[i]) {
				lockres_free(cinfo->other_bitmap_lockres[i]);
			}
		}
		kfree(cinfo->other_bitmap_lockres);
	}
}

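/*
 * gather_bitmaps() supports re-adding a device: it broadcasts RE_ADD so the
 * other nodes clear the Faulty flag on their copy of the device, then merges
 * every other slot's bitmap into the local one and pulls recovery_cp back so
 * the re-added device gets brought up to date.
 */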
static int gather_bitmaps(struct md_rdev *rdev)
{
	int sn, err;
	sector_t lo, hi;
	struct cluster_msg cmsg = {0};
	struct mddev *mddev = rdev->mddev;
	struct md_cluster_info *cinfo = mddev->cluster_info;

	cmsg.type = cpu_to_le32(RE_ADD);
	cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
	err = sendmsg(cinfo, &cmsg, 1);
	if (err)
		goto out;

	for (sn = 0; sn < mddev->bitmap_info.nodes; sn++) {
		if (sn == (cinfo->slot_number - 1))
			continue;
		err = bitmap_copy_from_slot(mddev, sn, &lo, &hi, false);
		if (err) {
			pr_warn("md-cluster: Could not gather bitmaps from slot %d\n", sn);
			goto out;
		}
		if ((hi > 0) && (lo < mddev->recovery_cp))
			mddev->recovery_cp = lo;
	}
out:
	return err;
}

static struct md_cluster_operations cluster_ops = {
	.join   = join,
	.leave  = leave,
	.slot_number = slot_number,
	.resync_start = resync_start,
	.resync_finish = resync_finish,
	.resync_info_update = resync_info_update,
	.metadata_update_start = metadata_update_start,
	.metadata_update_finish = metadata_update_finish,
	.metadata_update_cancel = metadata_update_cancel,
	.area_resyncing = area_resyncing,
	.add_new_disk = add_new_disk,
	.add_new_disk_cancel = add_new_disk_cancel,
	.new_disk_ack = new_disk_ack,
	.remove_disk = remove_disk,
	.load_bitmaps = load_bitmaps,
	.gather_bitmaps = gather_bitmaps,
	.lock_all_bitmaps = lock_all_bitmaps,
	.unlock_all_bitmaps = unlock_all_bitmaps,
	.update_size = update_size,
};

static int __init cluster_init(void)
{
	pr_warn("md-cluster: EXPERIMENTAL. Use with caution\n");
	pr_info("Registering Cluster MD functions\n");
	register_md_cluster_operations(&cluster_ops, THIS_MODULE);
	return 0;
}

static void cluster_exit(void)
{
	unregister_md_cluster_operations();
}

module_init(cluster_init);
module_exit(cluster_exit);
MODULE_AUTHOR("SUSE");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Clustering support for MD");