/*
 * Copyright (C) 2015, SUSE
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 */

#include <linux/module.h>
#include <linux/dlm.h>
#include <linux/sched.h>
#include <linux/raid/md_p.h>
#include "md.h"
#include "bitmap.h"
#include "md-cluster.h"

#define LVB_SIZE	64
#define NEW_DEV_TIMEOUT	5000

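/*
 * One DLM lock held by this node: the owning lockspace, the lock status
 * block (with an optional lock value block), the flags passed to dlm_lock(),
 * and a completion used to make the asynchronous DLM calls synchronous.
 */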
struct dlm_lock_resource {
	dlm_lockspace_t *ls;
	struct dlm_lksb lksb;
	char *name; /* lock name. */
	uint32_t flags; /* flags to pass to dlm_lock() */
	struct completion completion; /* completion for synchronized locking */
	void (*bast)(void *arg, int mode); /* blocking AST function pointer */
	struct mddev *mddev; /* pointing back to mddev. */
};

struct suspend_info {
	int slot;
	sector_t lo;
	sector_t hi;
	struct list_head list;
};

struct resync_info {
	__le64 lo;
	__le64 hi;
};

/* md_cluster_info flags */
#define	MD_CLUSTER_WAITING_FOR_NEWDISK		1
#define	MD_CLUSTER_SUSPEND_READ_BALANCING	2
#define	MD_CLUSTER_BEGIN_JOIN_CLUSTER		3

struct md_cluster_info {
	/* dlm lock space and resources for clustered raid. */
	dlm_lockspace_t *lockspace;
	int slot_number;
	struct completion completion;
	struct mutex sb_mutex;
	struct dlm_lock_resource *bitmap_lockres;
	struct list_head suspend_list;
	spinlock_t suspend_lock;
	struct md_thread *recovery_thread;
	unsigned long recovery_map;
	/* communication lock resources */
	struct dlm_lock_resource *ack_lockres;
	struct dlm_lock_resource *message_lockres;
	struct dlm_lock_resource *token_lockres;
	struct dlm_lock_resource *no_new_dev_lockres;
	struct md_thread *recv_thread;
	struct completion newdisk_completion;
	unsigned long state;
};

enum msg_type {
	METADATA_UPDATED = 0,
	RESYNCING,
	NEWDISK,
	REMOVE,
	RE_ADD,
	BITMAP_NEEDS_SYNC,
};

struct cluster_msg {
	int type;
	int slot;
	/* TODO: Unionize this for smaller footprint */
	sector_t low;
	sector_t high;
	char uuid[16];
	int raid_slot;
};

static void sync_ast(void *arg)
{
	struct dlm_lock_resource *res;

	res = (struct dlm_lock_resource *) arg;
	complete(&res->completion);
}

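/*
 * Issue a DLM request and wait for its AST, turning the asynchronous
 * dlm_lock() into a synchronous call. Returns the dlm_lock() error, or the
 * status from the lock status block once the request has completed.
 */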
static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
{
	int ret = 0;

	ret = dlm_lock(res->ls, mode, &res->lksb,
			res->flags, res->name, strlen(res->name),
			0, sync_ast, res, res->bast);
	if (ret)
		return ret;
	wait_for_completion(&res->completion);
	return res->lksb.sb_status;
}

static int dlm_unlock_sync(struct dlm_lock_resource *res)
{
	return dlm_lock_sync(res, DLM_LOCK_NL);
}

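/*
 * Allocate a lock resource, optionally with an LVB and a blocking-AST
 * callback, and take it in NL mode so that later users only need to
 * convert the lock to the mode they want.
 */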
static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
		char *name, void (*bastfn)(void *arg, int mode), int with_lvb)
{
	struct dlm_lock_resource *res = NULL;
	int ret, namelen;
	struct md_cluster_info *cinfo = mddev->cluster_info;

	res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
	if (!res)
		return NULL;
	init_completion(&res->completion);
	res->ls = cinfo->lockspace;
	res->mddev = mddev;
	namelen = strlen(name);
	res->name = kzalloc(namelen + 1, GFP_KERNEL);
	if (!res->name) {
		pr_err("md-cluster: Unable to allocate resource name for resource %s\n", name);
		goto out_err;
	}
	strlcpy(res->name, name, namelen + 1);
	if (with_lvb) {
		res->lksb.sb_lvbptr = kzalloc(LVB_SIZE, GFP_KERNEL);
		if (!res->lksb.sb_lvbptr) {
			pr_err("md-cluster: Unable to allocate LVB for resource %s\n", name);
			goto out_err;
		}
		res->flags = DLM_LKF_VALBLK;
	}

	if (bastfn)
		res->bast = bastfn;

	res->flags |= DLM_LKF_EXPEDITE;

	ret = dlm_lock_sync(res, DLM_LOCK_NL);
	if (ret) {
		pr_err("md-cluster: Unable to lock NL on new lock resource %s\n", name);
		goto out_err;
	}
	res->flags &= ~DLM_LKF_EXPEDITE;
	res->flags |= DLM_LKF_CONVERT;

	return res;
out_err:
	kfree(res->lksb.sb_lvbptr);
	kfree(res->name);
	kfree(res);
	return NULL;
}

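/*
 * Release a lock resource: cancel any request still in flight, wait for
 * the final AST, then free the name, the LVB and the resource itself.
 */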
static void lockres_free(struct dlm_lock_resource *res)
{
	int ret;

	if (!res)
		return;

	/* cancel a lock request or a conversion request that is blocked */
	res->flags |= DLM_LKF_CANCEL;
retry:
	ret = dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res);
	if (unlikely(ret != 0)) {
		pr_info("%s: failed to unlock %s return %d\n", __func__, res->name, ret);

		/* if a lock conversion is cancelled, the lock is put back
		 * on the grant queue; we need to ensure it is unlocked */
		if (ret == -DLM_ECANCEL)
			goto retry;
	}
	res->flags &= ~DLM_LKF_CANCEL;
	wait_for_completion(&res->completion);

	kfree(res->name);
	kfree(res->lksb.sb_lvbptr);
	kfree(res);
}

static void add_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres,
		sector_t lo, sector_t hi)
{
	struct resync_info *ri;

	ri = (struct resync_info *)lockres->lksb.sb_lvbptr;
	ri->lo = cpu_to_le64(lo);
	ri->hi = cpu_to_le64(hi);
}

static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres)
{
	struct resync_info ri;
	struct suspend_info *s = NULL;
	sector_t hi = 0;

	dlm_lock_sync(lockres, DLM_LOCK_CR);
	memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info));
	hi = le64_to_cpu(ri.hi);
	if (hi > 0) {
		s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
		if (!s)
			goto out;
		s->hi = hi;
		s->lo = le64_to_cpu(ri.lo);
	}
	dlm_unlock_sync(lockres);
out:
	return s;
}

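/*
 * Recovery thread: for each slot marked in recovery_map, drop any stale
 * suspend_info for that slot, grab the slot's bitmap lock in PW mode and
 * merge that bitmap into ours, so the failed node's dirty regions get
 * resynced by this node.
 */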
static void recover_bitmaps(struct md_thread *thread)
{
	struct mddev *mddev = thread->mddev;
	struct md_cluster_info *cinfo = mddev->cluster_info;
	struct dlm_lock_resource *bm_lockres;
	char str[64];
	int slot, ret;
	struct suspend_info *s, *tmp;
	sector_t lo, hi;

	while (cinfo->recovery_map) {
		slot = fls64((u64)cinfo->recovery_map) - 1;

		/* Clear suspend_area associated with the bitmap */
		spin_lock_irq(&cinfo->suspend_lock);
		list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
			if (slot == s->slot) {
				list_del(&s->list);
				kfree(s);
			}
		spin_unlock_irq(&cinfo->suspend_lock);

		snprintf(str, 64, "bitmap%04d", slot);
		bm_lockres = lockres_init(mddev, str, NULL, 1);
		if (!bm_lockres) {
			pr_err("md-cluster: Cannot initialize bitmaps\n");
			goto clear_bit;
		}

		ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
		if (ret) {
			pr_err("md-cluster: Could not DLM lock %s: %d\n",
					str, ret);
			goto clear_bit;
		}
		ret = bitmap_copy_from_slot(mddev, slot, &lo, &hi, true);
		if (ret) {
			pr_err("md-cluster: Could not copy data from bitmap %d\n", slot);
			goto dlm_unlock;
		}
		if (hi > 0) {
			/* TODO: Wait for the current resync to finish */
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			if (lo < mddev->recovery_cp)
				mddev->recovery_cp = lo;
			md_check_recovery(mddev);
		}
dlm_unlock:
		dlm_unlock_sync(bm_lockres);
clear_bit:
		/* lockres_free() tolerates a NULL resource */
		lockres_free(bm_lockres);
		clear_bit(slot, &cinfo->recovery_map);
	}
}

static void recover_prep(void *arg)
{
	struct mddev *mddev = arg;
	struct md_cluster_info *cinfo = mddev->cluster_info;

	set_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
}

static void __recover_slot(struct mddev *mddev, int slot)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	set_bit(slot, &cinfo->recovery_map);
	if (!cinfo->recovery_thread) {
		cinfo->recovery_thread = md_register_thread(recover_bitmaps,
				mddev, "recover");
		if (!cinfo->recovery_thread) {
			pr_warn("md-cluster: Could not create recovery thread\n");
			return;
		}
	}
	md_wakeup_thread(cinfo->recovery_thread);
}

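/*
 * DLM slot-recovery callback: another node went down, so map its DLM slot
 * to our zero-based numbering and kick off bitmap recovery for it.
 */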
static void recover_slot(void *arg, struct dlm_slot *slot)
{
	struct mddev *mddev = arg;
	struct md_cluster_info *cinfo = mddev->cluster_info;

	pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n",
			mddev->bitmap_info.cluster_name,
			slot->nodeid, slot->slot,
			cinfo->slot_number);
	/* subtract one since DLM slot numbers start at one while
	 * cluster-md slot numbers begin at zero */
	__recover_slot(mddev, slot->slot - 1);
}

static void recover_done(void *arg, struct dlm_slot *slots,
		int num_slots, int our_slot,
		uint32_t generation)
{
	struct mddev *mddev = arg;
	struct md_cluster_info *cinfo = mddev->cluster_info;

	cinfo->slot_number = our_slot;
	/* the completion only needs to be completed when this node joins
	 * the cluster; it doesn't need to run when another node fails */
	if (test_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state)) {
		complete(&cinfo->completion);
		clear_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
	}
	clear_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
}

/* These ops are called when a node joins the cluster, and they perform
 * lock recovery if a node failure occurs */
static const struct dlm_lockspace_ops md_ls_ops = {
	.recover_prep = recover_prep,
	.recover_slot = recover_slot,
	.recover_done = recover_done,
};

/*
 * The BAST function for the ack lock resource
 * This function wakes up the receive thread in
 * order to receive and process the message.
 */
static void ack_bast(void *arg, int mode)
{
	struct dlm_lock_resource *res = (struct dlm_lock_resource *)arg;
	struct md_cluster_info *cinfo = res->mddev->cluster_info;

	if (mode == DLM_LOCK_EX)
		md_wakeup_thread(cinfo->recv_thread);
}

static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot)
{
	struct suspend_info *s, *tmp;

	list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
		if (slot == s->slot) {
			pr_info("%s:%d Deleting suspend_info: %d\n",
					__func__, __LINE__, slot);
			list_del(&s->list);
			kfree(s);
			break;
		}
}

static void remove_suspend_info(struct md_cluster_info *cinfo, int slot)
{
	spin_lock_irq(&cinfo->suspend_lock);
	__remove_suspend_info(cinfo, slot);
	spin_unlock_irq(&cinfo->suspend_lock);
}

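/*
 * Track the range another node is resyncing so that we suspend I/O to it;
 * a zero 'hi' means that node's resync has finished, so drop the entry.
 */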
static void process_suspend_info(struct md_cluster_info *cinfo,
		int slot, sector_t lo, sector_t hi)
{
	struct suspend_info *s;

	if (!hi) {
		remove_suspend_info(cinfo, slot);
		return;
	}
	s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
	if (!s)
		return;
	s->slot = slot;
	s->lo = lo;
	s->hi = hi;
	spin_lock_irq(&cinfo->suspend_lock);
	/* Remove existing entry (if exists) before adding */
	__remove_suspend_info(cinfo, slot);
	list_add(&s->list, &cinfo->suspend_list);
	spin_unlock_irq(&cinfo->suspend_lock);
}

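/*
 * A NEWDISK message arrived: pass the device UUID and raid slot to
 * userspace through a uevent, then wait (up to NEW_DEV_TIMEOUT jiffies)
 * for userspace to answer via new_disk_ack().
 */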
static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
{
	char disk_uuid[64];
	struct md_cluster_info *cinfo = mddev->cluster_info;
	char event_name[] = "EVENT=ADD_DEVICE";
	char raid_slot[16];
	char *envp[] = {event_name, disk_uuid, raid_slot, NULL};
	int len;

	len = snprintf(disk_uuid, 64, "DEVICE_UUID=");
	sprintf(disk_uuid + len, "%pU", cmsg->uuid);
	snprintf(raid_slot, 16, "RAID_DISK=%d", cmsg->raid_slot);
	pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot);
	init_completion(&cinfo->newdisk_completion);
	set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
	kobject_uevent_env(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE, envp);
	wait_for_completion_timeout(&cinfo->newdisk_completion,
			NEW_DEV_TIMEOUT);
	clear_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
}

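/*
 * Another node updated the on-disk metadata: reload the superblock, then
 * re-acquire the no-new-dev lock in CR mode (it was released when this
 * node acknowledged the new device).
 */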
static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	md_reload_sb(mddev);
	dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
}

static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
{
	struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot);

	if (rdev)
		md_kick_rdev_from_array(rdev);
	else
		pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", __func__, __LINE__, msg->raid_slot);
}

static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg)
{
	struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot);

	if (rdev && test_bit(Faulty, &rdev->flags))
		clear_bit(Faulty, &rdev->flags);
	else
		pr_warn("%s: %d Could not find disk(%d) which is faulty\n", __func__, __LINE__, msg->raid_slot);
}

static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
{
	switch (msg->type) {
	case METADATA_UPDATED:
		pr_info("%s: %d Received message: METADATA_UPDATE from %d\n",
				__func__, __LINE__, msg->slot);
		process_metadata_update(mddev, msg);
		break;
	case RESYNCING:
		pr_info("%s: %d Received message: RESYNCING from %d\n",
				__func__, __LINE__, msg->slot);
		process_suspend_info(mddev->cluster_info, msg->slot,
				msg->low, msg->high);
		break;
	case NEWDISK:
		pr_info("%s: %d Received message: NEWDISK from %d\n",
				__func__, __LINE__, msg->slot);
		process_add_new_disk(mddev, msg);
		break;
	case REMOVE:
		pr_info("%s: %d Received REMOVE from %d\n",
				__func__, __LINE__, msg->slot);
		process_remove_disk(mddev, msg);
		break;
	case RE_ADD:
		pr_info("%s: %d Received RE_ADD from %d\n",
				__func__, __LINE__, msg->slot);
		process_readd_disk(mddev, msg);
		break;
	case BITMAP_NEEDS_SYNC:
		pr_info("%s: %d Received BITMAP_NEEDS_SYNC from %d\n",
				__func__, __LINE__, msg->slot);
		__recover_slot(mddev, msg->slot);
		break;
	default:
		pr_warn("%s:%d Received unknown message from %d\n",
				__func__, __LINE__, msg->slot);
	}
}

/*
 * thread for receiving message
 */
static void recv_daemon(struct md_thread *thread)
{
	struct md_cluster_info *cinfo = thread->mddev->cluster_info;
	struct dlm_lock_resource *ack_lockres = cinfo->ack_lockres;
	struct dlm_lock_resource *message_lockres = cinfo->message_lockres;
	struct cluster_msg msg;
	int ret;

	/* get CR on Message */
	if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) {
		pr_err("md-cluster: failed to get CR on MESSAGE\n");
		return;
	}

	/* read lvb and wake up thread to process this message_lockres */
	memcpy(&msg, message_lockres->lksb.sb_lvbptr, sizeof(struct cluster_msg));
	process_recvd_msg(thread->mddev, &msg);

	/* release CR on ack_lockres */
	ret = dlm_unlock_sync(ack_lockres);
	if (unlikely(ret != 0))
		pr_info("unlock ack failed return %d\n", ret);
	/* up-convert to PR on message_lockres */
	ret = dlm_lock_sync(message_lockres, DLM_LOCK_PR);
	if (unlikely(ret != 0))
		pr_info("lock PR on msg failed return %d\n", ret);
	/* get CR on ack_lockres again */
	ret = dlm_lock_sync(ack_lockres, DLM_LOCK_CR);
	if (unlikely(ret != 0))
		pr_info("lock CR on ack failed return %d\n", ret);
	/* release CR on message_lockres */
	ret = dlm_unlock_sync(message_lockres);
	if (unlikely(ret != 0))
		pr_info("unlock msg failed return %d\n", ret);
}

/* lock_comm()
 * Takes the lock on the TOKEN lock resource so no other
 * node can communicate while the operation is underway.
 */
static int lock_comm(struct md_cluster_info *cinfo)
{
	int error;

	error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
	if (error)
		pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
				__func__, __LINE__, error);
	return error;
}

static void unlock_comm(struct md_cluster_info *cinfo)
{
	dlm_unlock_sync(cinfo->token_lockres);
}

/* __sendmsg()
 * This function performs the actual sending of the message. It is usually
 * called after performing the encompassing operation.
 * The function:
 * 1. Grabs the message lockresource in EX mode
 * 2. Copies the message to the message LVB
 * 3. Downconverts message lockresource to CW
 * 4. Upconverts ack lock resource from CR to EX. This forces the BAST on other nodes
 *    and the other nodes read the message. The thread will wait here until all other
 *    nodes have released ack lock resource.
 * 5. Downconvert ack lockresource to CR
 */
static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
{
	int error, ret;
	int slot = cinfo->slot_number - 1;

	cmsg->slot = cpu_to_le32(slot);
	/* get EX on Message */
	error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_EX);
	if (error) {
		pr_err("md-cluster: failed to get EX on MESSAGE (%d)\n", error);
		goto failed_message;
	}

	memcpy(cinfo->message_lockres->lksb.sb_lvbptr, (void *)cmsg,
			sizeof(struct cluster_msg));
	/* down-convert EX to CW on Message */
	error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CW);
	if (error) {
		pr_err("md-cluster: failed to convert EX to CW on MESSAGE(%d)\n",
				error);
		goto failed_ack;
	}

	/* up-convert CR to EX on Ack */
	error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_EX);
	if (error) {
		pr_err("md-cluster: failed to convert CR to EX on ACK(%d)\n",
				error);
		goto failed_ack;
	}

	/* down-convert EX to CR on Ack */
	error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR);
	if (error) {
		pr_err("md-cluster: failed to convert EX to CR on ACK(%d)\n",
				error);
		goto failed_ack;
	}

failed_ack:
	/* use a separate variable so a successful unlock doesn't
	 * clobber the error being returned */
	ret = dlm_unlock_sync(cinfo->message_lockres);
	if (unlikely(ret != 0)) {
		pr_err("md-cluster: failed convert to NL on MESSAGE(%d)\n",
				ret);
		/* in case the message can't be released due to some reason */
		goto failed_ack;
	}
failed_message:
	return error;
}

static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
{
	int ret;

	lock_comm(cinfo);
	ret = __sendmsg(cinfo, cmsg);
	unlock_comm(cinfo);
	return ret;
}

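/*
 * Called on join: probe every other slot's bitmap lock with DLM_LKF_NOQUEUE.
 * -EAGAIN means that slot's owner is mid-resync, so record the suspended
 * range from its LVB; otherwise read the slot's bitmap to check whether
 * this node must take over an unfinished recovery.
 */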
static int gather_all_resync_info(struct mddev *mddev, int total_slots)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	int i, ret = 0;
	struct dlm_lock_resource *bm_lockres;
	struct suspend_info *s;
	char str[64];
	sector_t lo, hi;

	for (i = 0; i < total_slots; i++) {
		memset(str, '\0', 64);
		snprintf(str, 64, "bitmap%04d", i);
		bm_lockres = lockres_init(mddev, str, NULL, 1);
		if (!bm_lockres)
			return -ENOMEM;
		if (i == (cinfo->slot_number - 1)) {
			lockres_free(bm_lockres);
			continue;
		}

		bm_lockres->flags |= DLM_LKF_NOQUEUE;
		ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
		if (ret == -EAGAIN) {
			memset(bm_lockres->lksb.sb_lvbptr, '\0', LVB_SIZE);
			s = read_resync_info(mddev, bm_lockres);
			if (s) {
				pr_info("%s:%d Resync[%llu..%llu] in progress on %d\n",
						__func__, __LINE__,
						(unsigned long long) s->lo,
						(unsigned long long) s->hi, i);
				spin_lock_irq(&cinfo->suspend_lock);
				s->slot = i;
				list_add(&s->list, &cinfo->suspend_list);
				spin_unlock_irq(&cinfo->suspend_lock);
			}
			ret = 0;
			lockres_free(bm_lockres);
			continue;
		}
		if (ret) {
			lockres_free(bm_lockres);
			goto out;
		}

		/* Read the disk bitmap sb and check if it needs recovery */
		ret = bitmap_copy_from_slot(mddev, i, &lo, &hi, false);
		if (ret) {
			pr_warn("md-cluster: Could not gather bitmaps from slot %d\n", i);
			lockres_free(bm_lockres);
			continue;
		}
		if ((hi > 0) && (lo < mddev->recovery_cp)) {
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			mddev->recovery_cp = lo;
			md_check_recovery(mddev);
		}

		dlm_unlock_sync(bm_lockres);
		lockres_free(bm_lockres);
	}
out:
	return ret;
}

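/*
 * Join the DLM lockspace named after the array UUID, wait for our slot
 * number, set up the communication lock resources and the receive thread,
 * take PW on our own bitmap lock, and gather resync state from the others.
 */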
static int join(struct mddev *mddev, int nodes)
{
	struct md_cluster_info *cinfo;
	int ret, ops_rv;
	char str[64];

	cinfo = kzalloc(sizeof(struct md_cluster_info), GFP_KERNEL);
	if (!cinfo)
		return -ENOMEM;

	INIT_LIST_HEAD(&cinfo->suspend_list);
	spin_lock_init(&cinfo->suspend_lock);
	init_completion(&cinfo->completion);
	set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);

	mutex_init(&cinfo->sb_mutex);
	mddev->cluster_info = cinfo;

	memset(str, 0, 64);
	sprintf(str, "%pU", mddev->uuid);
	ret = dlm_new_lockspace(str, mddev->bitmap_info.cluster_name,
				DLM_LSFL_FS, LVB_SIZE,
				&md_ls_ops, mddev, &ops_rv, &cinfo->lockspace);
	if (ret)
		goto err;
	wait_for_completion(&cinfo->completion);
	if (nodes < cinfo->slot_number) {
		pr_err("md-cluster: Slot allotted(%d) is greater than available slots(%d).\n",
				cinfo->slot_number, nodes);
		ret = -ERANGE;
		goto err;
	}
	/* Initiate the communication resources */
	ret = -ENOMEM;
	cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv");
	if (!cinfo->recv_thread) {
		pr_err("md-cluster: cannot allocate memory for recv_thread!\n");
		goto err;
	}
	cinfo->message_lockres = lockres_init(mddev, "message", NULL, 1);
	if (!cinfo->message_lockres)
		goto err;
	cinfo->token_lockres = lockres_init(mddev, "token", NULL, 0);
	if (!cinfo->token_lockres)
		goto err;
	cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0);
	if (!cinfo->ack_lockres)
		goto err;
	cinfo->no_new_dev_lockres = lockres_init(mddev, "no-new-dev", NULL, 0);
	if (!cinfo->no_new_dev_lockres)
		goto err;

	/* get sync CR lock on ACK. */
	if (dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR))
		pr_err("md-cluster: failed to get a sync CR lock on ACK!\n");
	/* get sync CR lock on no-new-dev. */
	if (dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR))
		pr_err("md-cluster: failed to get a sync CR lock on no-new-dev!\n");

	pr_info("md-cluster: Joined cluster %s slot %d\n", str, cinfo->slot_number);
	snprintf(str, 64, "bitmap%04d", cinfo->slot_number - 1);
	cinfo->bitmap_lockres = lockres_init(mddev, str, NULL, 1);
	if (!cinfo->bitmap_lockres)
		goto err;
	if (dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW)) {
		pr_err("Failed to get bitmap lock\n");
		ret = -EINVAL;
		goto err;
	}

	ret = gather_all_resync_info(mddev, nodes);
	if (ret)
		goto err;

	return 0;
err:
	lockres_free(cinfo->message_lockres);
	lockres_free(cinfo->token_lockres);
	lockres_free(cinfo->ack_lockres);
	lockres_free(cinfo->no_new_dev_lockres);
	lockres_free(cinfo->bitmap_lockres);
	if (cinfo->lockspace)
		dlm_release_lockspace(cinfo->lockspace, 2);
	mddev->cluster_info = NULL;
	kfree(cinfo);
	return ret;
}

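/*
 * Tear down in the reverse order of join(): stop both threads, free every
 * lock resource and leave the lockspace.
 */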
static int leave(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	if (!cinfo)
		return 0;
	md_unregister_thread(&cinfo->recovery_thread);
	md_unregister_thread(&cinfo->recv_thread);
	lockres_free(cinfo->message_lockres);
	lockres_free(cinfo->token_lockres);
	lockres_free(cinfo->ack_lockres);
	lockres_free(cinfo->no_new_dev_lockres);
	lockres_free(cinfo->bitmap_lockres);
	dlm_release_lockspace(cinfo->lockspace, 2);
	return 0;
}

/* slot_number(): Returns the MD slot number to use
 * DLM starts the slot numbers from 1, whereas cluster-md
 * wants the number to be from zero, so we subtract one
 */
static int slot_number(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	return cinfo->slot_number - 1;
}

static void resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	add_resync_info(mddev, cinfo->bitmap_lockres, lo, hi);
	/* Re-acquire the lock to refresh LVB */
	dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
}

static int metadata_update_start(struct mddev *mddev)
{
	return lock_comm(mddev->cluster_info);
}

static int metadata_update_finish(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	struct cluster_msg cmsg;
	int ret;

	memset(&cmsg, 0, sizeof(cmsg));
	cmsg.type = cpu_to_le32(METADATA_UPDATED);
	ret = __sendmsg(cinfo, &cmsg);
	unlock_comm(cinfo);
	return ret;
}

static int metadata_update_cancel(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	return dlm_unlock_sync(cinfo->token_lockres);
}

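/*
 * Publish the range [lo, hi] we are resyncing: write it to our bitmap LVB
 * for late joiners, then broadcast it so running nodes suspend I/O to that
 * range. A (0, 0) range signals that the resync has finished.
 */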
static int resync_send(struct mddev *mddev, enum msg_type type,
		sector_t lo, sector_t hi)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	struct cluster_msg cmsg;
	int slot = cinfo->slot_number - 1;

	pr_info("%s:%d lo: %llu hi: %llu\n", __func__, __LINE__,
			(unsigned long long)lo,
			(unsigned long long)hi);
	resync_info_update(mddev, lo, hi);
	cmsg.type = cpu_to_le32(type);
	cmsg.slot = cpu_to_le32(slot);
	cmsg.low = cpu_to_le64(lo);
	cmsg.high = cpu_to_le64(hi);
	return sendmsg(cinfo, &cmsg);
}

static int resync_start(struct mddev *mddev, sector_t lo, sector_t hi)
{
	pr_info("%s:%d\n", __func__, __LINE__);
	return resync_send(mddev, RESYNCING, lo, hi);
}

static void resync_finish(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	struct cluster_msg cmsg;
	int slot = cinfo->slot_number - 1;

	pr_info("%s:%d\n", __func__, __LINE__);
	resync_send(mddev, RESYNCING, 0, 0);
	if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
		cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
		cmsg.slot = cpu_to_le32(slot);
		sendmsg(cinfo, &cmsg);
	}
}

static int area_resyncing(struct mddev *mddev, int direction,
		sector_t lo, sector_t hi)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	int ret = 0;
	struct suspend_info *s;

	if ((direction == READ) &&
		test_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state))
		return 1;

	spin_lock_irq(&cinfo->suspend_lock);
	if (list_empty(&cinfo->suspend_list))
		goto out;
	list_for_each_entry(s, &cinfo->suspend_list, list)
		if (hi > s->lo && lo < s->hi) {
			ret = 1;
			break;
		}
out:
	spin_unlock_irq(&cinfo->suspend_lock);
	return ret;
}

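/*
 * Broadcast NEWDISK under the token lock, then try to take the no-new-dev
 * lock in EX with NOQUEUE set: this only succeeds once every node that can
 * see the device has dropped its CR hold in new_disk_ack(), so -EAGAIN
 * means some node cannot see the new device.
 */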
static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	struct cluster_msg cmsg;
	int ret = 0;
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	char *uuid = sb->device_uuid;

	memset(&cmsg, 0, sizeof(cmsg));
	cmsg.type = cpu_to_le32(NEWDISK);
	memcpy(cmsg.uuid, uuid, 16);
	cmsg.raid_slot = rdev->desc_nr;
	lock_comm(cinfo);
	ret = __sendmsg(cinfo, &cmsg);
	if (ret) {
		/* nobody received the message, so don't keep the token */
		unlock_comm(cinfo);
		return ret;
	}
	cinfo->no_new_dev_lockres->flags |= DLM_LKF_NOQUEUE;
	ret = dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_EX);
	cinfo->no_new_dev_lockres->flags &= ~DLM_LKF_NOQUEUE;
	/* Some node does not "see" the device */
	if (ret == -EAGAIN)
		ret = -ENOENT;
	else
		dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
	return ret;
}

static int add_new_disk_finish(struct mddev *mddev)
{
	struct cluster_msg cmsg;
	struct md_cluster_info *cinfo = mddev->cluster_info;
	int ret;

	/* Write sb and inform others */
	md_update_sb(mddev, 1);
	cmsg.type = cpu_to_le32(METADATA_UPDATED);
	ret = __sendmsg(cinfo, &cmsg);
	unlock_comm(cinfo);
	return ret;
}

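/*
 * Userspace's verdict on a NEWDISK uevent: on a positive ack, drop our CR
 * hold on the no-new-dev lock so the sending node's EX request can succeed.
 */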
static int new_disk_ack(struct mddev *mddev, bool ack)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	if (!test_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state)) {
		pr_warn("md-cluster(%s): Spurious cluster confirmation\n", mdname(mddev));
		return -EINVAL;
	}

	if (ack)
		dlm_unlock_sync(cinfo->no_new_dev_lockres);
	complete(&cinfo->newdisk_completion);
	return 0;
}

static int remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{
	struct cluster_msg cmsg;
	struct md_cluster_info *cinfo = mddev->cluster_info;

	cmsg.type = cpu_to_le32(REMOVE);
	cmsg.raid_slot = rdev->desc_nr;
	return __sendmsg(cinfo, &cmsg);
}

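/*
 * On re-add, tell the other nodes (RE_ADD) to clear the device's Faulty
 * flag, then merge every other slot's bitmap into ours so the re-added
 * device is recovered against writes made while it was missing.
 */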
static int gather_bitmaps(struct md_rdev *rdev)
{
	int sn, err;
	sector_t lo, hi;
	struct cluster_msg cmsg;
	struct mddev *mddev = rdev->mddev;
	struct md_cluster_info *cinfo = mddev->cluster_info;

	cmsg.type = cpu_to_le32(RE_ADD);
	cmsg.raid_slot = rdev->desc_nr;
	err = sendmsg(cinfo, &cmsg);
	if (err)
		goto out;

	for (sn = 0; sn < mddev->bitmap_info.nodes; sn++) {
		if (sn == (cinfo->slot_number - 1))
			continue;
		err = bitmap_copy_from_slot(mddev, sn, &lo, &hi, false);
		if (err) {
			pr_warn("md-cluster: Could not gather bitmaps from slot %d\n", sn);
			goto out;
		}
		if ((hi > 0) && (lo < mddev->recovery_cp))
			mddev->recovery_cp = lo;
	}
out:
	return err;
}

static struct md_cluster_operations cluster_ops = {
	.join   = join,
	.leave  = leave,
	.slot_number = slot_number,
	.resync_info_update = resync_info_update,
	.resync_start = resync_start,
	.resync_finish = resync_finish,
	.metadata_update_start = metadata_update_start,
	.metadata_update_finish = metadata_update_finish,
	.metadata_update_cancel = metadata_update_cancel,
	.area_resyncing = area_resyncing,
	.add_new_disk_start = add_new_disk_start,
	.add_new_disk_finish = add_new_disk_finish,
	.new_disk_ack = new_disk_ack,
	.remove_disk = remove_disk,
	.gather_bitmaps = gather_bitmaps,
};

static int __init cluster_init(void)
{
	pr_warn("md-cluster: EXPERIMENTAL. Use with caution\n");
	pr_info("Registering Cluster MD functions\n");
	register_md_cluster_operations(&cluster_ops, THIS_MODULE);
	return 0;
}

static void cluster_exit(void)
{
	unregister_md_cluster_operations();
}

module_init(cluster_init);
module_exit(cluster_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Clustering support for MD");