rotate.c 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623
  1. /* Handle fileserver selection and rotation.
  2. *
  3. * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved.
  4. * Written by David Howells (dhowells@redhat.com)
  5. *
  6. * This program is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU General Public Licence
  8. * as published by the Free Software Foundation; either version
  9. * 2 of the Licence, or (at your option) any later version.
  10. */
  11. #include <linux/kernel.h>
  12. #include <linux/slab.h>
  13. #include <linux/fs.h>
  14. #include <linux/sched.h>
  15. #include <linux/delay.h>
  16. #include <linux/sched/signal.h>
  17. #include "internal.h"
  18. #include "afs_fs.h"
  19. /*
  20. * Begin an operation on the fileserver.
  21. *
  22. * Fileserver operations are serialised on the server by vnode, so we serialise
  23. * them here also using the io_lock.
  24. */
  25. bool afs_begin_vnode_operation(struct afs_fs_cursor *fc, struct afs_vnode *vnode,
  26. struct key *key)
  27. {
  28. memset(fc, 0, sizeof(*fc));
  29. fc->vnode = vnode;
  30. fc->key = key;
  31. fc->ac.error = SHRT_MAX;
  32. fc->error = -EDESTADDRREQ;
  33. if (mutex_lock_interruptible(&vnode->io_lock) < 0) {
  34. fc->error = -EINTR;
  35. fc->flags |= AFS_FS_CURSOR_STOP;
  36. return false;
  37. }
  38. if (vnode->lock_state != AFS_VNODE_LOCK_NONE)
  39. fc->flags |= AFS_FS_CURSOR_CUR_ONLY;
  40. return true;
  41. }
  42. /*
  43. * Begin iteration through a server list, starting with the vnode's last used
  44. * server if possible, or the last recorded good server if not.
  45. */
  46. static bool afs_start_fs_iteration(struct afs_fs_cursor *fc,
  47. struct afs_vnode *vnode)
  48. {
  49. struct afs_cb_interest *cbi;
  50. int i;
  51. read_lock(&vnode->volume->servers_lock);
  52. fc->server_list = afs_get_serverlist(vnode->volume->servers);
  53. read_unlock(&vnode->volume->servers_lock);
  54. fc->untried = (1UL << fc->server_list->nr_servers) - 1;
  55. fc->index = READ_ONCE(fc->server_list->preferred);
  56. cbi = vnode->cb_interest;
  57. if (cbi) {
  58. /* See if the vnode's preferred record is still available */
  59. for (i = 0; i < fc->server_list->nr_servers; i++) {
  60. if (fc->server_list->servers[i].cb_interest == cbi) {
  61. fc->index = i;
  62. goto found_interest;
  63. }
  64. }
  65. /* If we have a lock outstanding on a server that's no longer
  66. * serving this vnode, then we can't switch to another server
  67. * and have to return an error.
  68. */
  69. if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) {
  70. fc->error = -ESTALE;
  71. return false;
  72. }
  73. /* Note that the callback promise is effectively broken */
  74. write_seqlock(&vnode->cb_lock);
  75. ASSERTCMP(cbi, ==, vnode->cb_interest);
  76. vnode->cb_interest = NULL;
  77. if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags))
  78. vnode->cb_break++;
  79. write_sequnlock(&vnode->cb_lock);
  80. afs_put_cb_interest(afs_v2net(vnode), cbi);
  81. cbi = NULL;
  82. }
  83. found_interest:
  84. return true;
  85. }
  86. /*
  87. * Post volume busy note.
  88. */
  89. static void afs_busy(struct afs_volume *volume, u32 abort_code)
  90. {
  91. const char *m;
  92. switch (abort_code) {
  93. case VOFFLINE: m = "offline"; break;
  94. case VRESTARTING: m = "restarting"; break;
  95. case VSALVAGING: m = "being salvaged"; break;
  96. default: m = "busy"; break;
  97. }
  98. pr_notice("kAFS: Volume %llu '%s' is %s\n", volume->vid, volume->name, m);
  99. }
  100. /*
  101. * Sleep and retry the operation to the same fileserver.
  102. */
  103. static bool afs_sleep_and_retry(struct afs_fs_cursor *fc)
  104. {
  105. msleep_interruptible(1000);
  106. if (signal_pending(current)) {
  107. fc->error = -ERESTARTSYS;
  108. return false;
  109. }
  110. return true;
  111. }
  112. /*
  113. * Select the fileserver to use. May be called multiple times to rotate
  114. * through the fileservers.
  115. */
  116. bool afs_select_fileserver(struct afs_fs_cursor *fc)
  117. {
  118. struct afs_addr_list *alist;
  119. struct afs_server *server;
  120. struct afs_vnode *vnode = fc->vnode;
  121. struct afs_error e;
  122. u32 rtt;
  123. int error = fc->ac.error, i;
  124. _enter("%lx[%d],%lx[%d],%d,%d",
  125. fc->untried, fc->index,
  126. fc->ac.tried, fc->ac.index,
  127. error, fc->ac.abort_code);
  128. if (fc->flags & AFS_FS_CURSOR_STOP) {
  129. _leave(" = f [stopped]");
  130. return false;
  131. }
  132. fc->nr_iterations++;
  133. /* Evaluate the result of the previous operation, if there was one. */
  134. switch (error) {
  135. case SHRT_MAX:
  136. goto start;
  137. case 0:
  138. default:
  139. /* Success or local failure. Stop. */
  140. fc->error = error;
  141. fc->flags |= AFS_FS_CURSOR_STOP;
  142. _leave(" = f [okay/local %d]", error);
  143. return false;
  144. case -ECONNABORTED:
  145. /* The far side rejected the operation on some grounds. This
  146. * might involve the server being busy or the volume having been moved.
  147. */
  148. switch (fc->ac.abort_code) {
  149. case VNOVOL:
  150. /* This fileserver doesn't know about the volume.
  151. * - May indicate that the VL is wrong - retry once and compare
  152. * the results.
  153. * - May indicate that the fileserver couldn't attach to the vol.
  154. */
  155. if (fc->flags & AFS_FS_CURSOR_VNOVOL) {
  156. fc->error = -EREMOTEIO;
  157. goto next_server;
  158. }
  159. write_lock(&vnode->volume->servers_lock);
  160. fc->server_list->vnovol_mask |= 1 << fc->index;
  161. write_unlock(&vnode->volume->servers_lock);
  162. set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags);
  163. error = afs_check_volume_status(vnode->volume, fc->key);
  164. if (error < 0)
  165. goto failed_set_error;
  166. if (test_bit(AFS_VOLUME_DELETED, &vnode->volume->flags)) {
  167. fc->error = -ENOMEDIUM;
  168. goto failed;
  169. }
  170. /* If the server list didn't change, then assume that
  171. * it's the fileserver having trouble.
  172. */
  173. if (vnode->volume->servers == fc->server_list) {
  174. fc->error = -EREMOTEIO;
  175. goto next_server;
  176. }
  177. /* Try again */
  178. fc->flags |= AFS_FS_CURSOR_VNOVOL;
  179. _leave(" = t [vnovol]");
  180. return true;
  181. case VSALVAGE: /* TODO: Should this return an error or iterate? */
  182. case VVOLEXISTS:
  183. case VNOSERVICE:
  184. case VONLINE:
  185. case VDISKFULL:
  186. case VOVERQUOTA:
  187. fc->error = afs_abort_to_error(fc->ac.abort_code);
  188. goto next_server;
  189. case VOFFLINE:
  190. if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags)) {
  191. afs_busy(vnode->volume, fc->ac.abort_code);
  192. clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags);
  193. }
  194. if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) {
  195. fc->error = -EADV;
  196. goto failed;
  197. }
  198. if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) {
  199. fc->error = -ESTALE;
  200. goto failed;
  201. }
  202. goto busy;
  203. case VSALVAGING:
  204. case VRESTARTING:
  205. case VBUSY:
  206. /* Retry after going round all the servers unless we
  207. * have a file lock we need to maintain.
  208. */
  209. if (fc->flags & AFS_FS_CURSOR_NO_VSLEEP) {
  210. fc->error = -EBUSY;
  211. goto failed;
  212. }
  213. if (!test_and_set_bit(AFS_VOLUME_BUSY, &vnode->volume->flags)) {
  214. afs_busy(vnode->volume, fc->ac.abort_code);
  215. clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags);
  216. }
  217. busy:
  218. if (fc->flags & AFS_FS_CURSOR_CUR_ONLY) {
  219. if (!afs_sleep_and_retry(fc))
  220. goto failed;
  221. /* Retry with same server & address */
  222. _leave(" = t [vbusy]");
  223. return true;
  224. }
  225. fc->flags |= AFS_FS_CURSOR_VBUSY;
  226. goto next_server;
  227. case VMOVED:
  228. /* The volume migrated to another server. We consider
  229. * consider all locks and callbacks broken and request
  230. * an update from the VLDB.
  231. *
  232. * We also limit the number of VMOVED hops we will
  233. * honour, just in case someone sets up a loop.
  234. */
  235. if (fc->flags & AFS_FS_CURSOR_VMOVED) {
  236. fc->error = -EREMOTEIO;
  237. goto failed;
  238. }
  239. fc->flags |= AFS_FS_CURSOR_VMOVED;
  240. set_bit(AFS_VOLUME_WAIT, &vnode->volume->flags);
  241. set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags);
  242. error = afs_check_volume_status(vnode->volume, fc->key);
  243. if (error < 0)
  244. goto failed_set_error;
  245. /* If the server list didn't change, then the VLDB is
  246. * out of sync with the fileservers. This is hopefully
  247. * a temporary condition, however, so we don't want to
  248. * permanently block access to the file.
  249. *
  250. * TODO: Try other fileservers if we can.
  251. *
  252. * TODO: Retry a few times with sleeps.
  253. */
  254. if (vnode->volume->servers == fc->server_list) {
  255. fc->error = -ENOMEDIUM;
  256. goto failed;
  257. }
  258. goto restart_from_beginning;
  259. default:
  260. clear_bit(AFS_VOLUME_OFFLINE, &vnode->volume->flags);
  261. clear_bit(AFS_VOLUME_BUSY, &vnode->volume->flags);
  262. fc->error = afs_abort_to_error(fc->ac.abort_code);
  263. goto failed;
  264. }
  265. case -ETIMEDOUT:
  266. case -ETIME:
  267. if (fc->error != -EDESTADDRREQ)
  268. goto iterate_address;
  269. /* Fall through */
  270. case -ERFKILL:
  271. case -EADDRNOTAVAIL:
  272. case -ENETUNREACH:
  273. case -EHOSTUNREACH:
  274. case -EHOSTDOWN:
  275. case -ECONNREFUSED:
  276. _debug("no conn");
  277. fc->error = error;
  278. goto iterate_address;
  279. case -ECONNRESET:
  280. _debug("call reset");
  281. fc->error = error;
  282. goto failed;
  283. }
  284. restart_from_beginning:
  285. _debug("restart");
  286. afs_end_cursor(&fc->ac);
  287. afs_put_cb_interest(afs_v2net(vnode), fc->cbi);
  288. fc->cbi = NULL;
  289. afs_put_serverlist(afs_v2net(vnode), fc->server_list);
  290. fc->server_list = NULL;
  291. start:
  292. _debug("start");
  293. /* See if we need to do an update of the volume record. Note that the
  294. * volume may have moved or even have been deleted.
  295. */
  296. error = afs_check_volume_status(vnode->volume, fc->key);
  297. if (error < 0)
  298. goto failed_set_error;
  299. if (!afs_start_fs_iteration(fc, vnode))
  300. goto failed;
  301. _debug("__ VOL %llx __", vnode->volume->vid);
  302. error = afs_probe_fileservers(afs_v2net(vnode), fc->key, fc->server_list);
  303. if (error < 0)
  304. goto failed_set_error;
  305. pick_server:
  306. _debug("pick [%lx]", fc->untried);
  307. error = afs_wait_for_fs_probes(fc->server_list, fc->untried);
  308. if (error < 0)
  309. goto failed_set_error;
  310. /* Pick the untried server with the lowest RTT. If we have outstanding
  311. * callbacks, we stick with the server we're already using if we can.
  312. */
  313. if (fc->cbi) {
  314. _debug("cbi %u", fc->index);
  315. if (test_bit(fc->index, &fc->untried))
  316. goto selected_server;
  317. afs_put_cb_interest(afs_v2net(vnode), fc->cbi);
  318. fc->cbi = NULL;
  319. _debug("nocbi");
  320. }
  321. fc->index = -1;
  322. rtt = U32_MAX;
  323. for (i = 0; i < fc->server_list->nr_servers; i++) {
  324. struct afs_server *s = fc->server_list->servers[i].server;
  325. if (!test_bit(i, &fc->untried) || !s->probe.responded)
  326. continue;
  327. if (s->probe.rtt < rtt) {
  328. fc->index = i;
  329. rtt = s->probe.rtt;
  330. }
  331. }
  332. if (fc->index == -1)
  333. goto no_more_servers;
  334. selected_server:
  335. _debug("use %d", fc->index);
  336. __clear_bit(fc->index, &fc->untried);
  337. /* We're starting on a different fileserver from the list. We need to
  338. * check it, create a callback intercept, find its address list and
  339. * probe its capabilities before we use it.
  340. */
  341. ASSERTCMP(fc->ac.alist, ==, NULL);
  342. server = fc->server_list->servers[fc->index].server;
  343. if (!afs_check_server_record(fc, server))
  344. goto failed;
  345. _debug("USING SERVER: %pU", &server->uuid);
  346. /* Make sure we've got a callback interest record for this server. We
  347. * have to link it in before we send the request as we can be sent a
  348. * break request before we've finished decoding the reply and
  349. * installing the vnode.
  350. */
  351. error = afs_register_server_cb_interest(vnode, fc->server_list,
  352. fc->index);
  353. if (error < 0)
  354. goto failed_set_error;
  355. fc->cbi = afs_get_cb_interest(vnode->cb_interest);
  356. read_lock(&server->fs_lock);
  357. alist = rcu_dereference_protected(server->addresses,
  358. lockdep_is_held(&server->fs_lock));
  359. afs_get_addrlist(alist);
  360. read_unlock(&server->fs_lock);
  361. memset(&fc->ac, 0, sizeof(fc->ac));
  362. if (!fc->ac.alist)
  363. fc->ac.alist = alist;
  364. else
  365. afs_put_addrlist(alist);
  366. fc->ac.index = -1;
  367. iterate_address:
  368. ASSERT(fc->ac.alist);
  369. /* Iterate over the current server's address list to try and find an
  370. * address on which it will respond to us.
  371. */
  372. if (!afs_iterate_addresses(&fc->ac))
  373. goto next_server;
  374. _debug("address [%u] %u/%u", fc->index, fc->ac.index, fc->ac.alist->nr_addrs);
  375. _leave(" = t");
  376. return true;
  377. next_server:
  378. _debug("next");
  379. afs_end_cursor(&fc->ac);
  380. goto pick_server;
  381. no_more_servers:
  382. /* That's all the servers poked to no good effect. Try again if some
  383. * of them were busy.
  384. */
  385. if (fc->flags & AFS_FS_CURSOR_VBUSY)
  386. goto restart_from_beginning;
  387. e.error = -EDESTADDRREQ;
  388. e.responded = false;
  389. for (i = 0; i < fc->server_list->nr_servers; i++) {
  390. struct afs_server *s = fc->server_list->servers[i].server;
  391. afs_prioritise_error(&e, READ_ONCE(s->probe.error),
  392. s->probe.abort_code);
  393. }
  394. failed_set_error:
  395. fc->error = error;
  396. failed:
  397. fc->flags |= AFS_FS_CURSOR_STOP;
  398. afs_end_cursor(&fc->ac);
  399. _leave(" = f [failed %d]", fc->error);
  400. return false;
  401. }
  402. /*
  403. * Select the same fileserver we used for a vnode before and only that
  404. * fileserver. We use this when we have a lock on that file, which is backed
  405. * only by the fileserver we obtained it from.
  406. */
  407. bool afs_select_current_fileserver(struct afs_fs_cursor *fc)
  408. {
  409. struct afs_vnode *vnode = fc->vnode;
  410. struct afs_cb_interest *cbi = vnode->cb_interest;
  411. struct afs_addr_list *alist;
  412. int error = fc->ac.error;
  413. _enter("");
  414. switch (error) {
  415. case SHRT_MAX:
  416. if (!cbi) {
  417. fc->error = -ESTALE;
  418. fc->flags |= AFS_FS_CURSOR_STOP;
  419. return false;
  420. }
  421. fc->cbi = afs_get_cb_interest(vnode->cb_interest);
  422. read_lock(&cbi->server->fs_lock);
  423. alist = rcu_dereference_protected(cbi->server->addresses,
  424. lockdep_is_held(&cbi->server->fs_lock));
  425. afs_get_addrlist(alist);
  426. read_unlock(&cbi->server->fs_lock);
  427. if (!alist) {
  428. fc->error = -ESTALE;
  429. fc->flags |= AFS_FS_CURSOR_STOP;
  430. return false;
  431. }
  432. memset(&fc->ac, 0, sizeof(fc->ac));
  433. fc->ac.alist = alist;
  434. fc->ac.index = -1;
  435. goto iterate_address;
  436. case 0:
  437. default:
  438. /* Success or local failure. Stop. */
  439. fc->error = error;
  440. fc->flags |= AFS_FS_CURSOR_STOP;
  441. _leave(" = f [okay/local %d]", error);
  442. return false;
  443. case -ECONNABORTED:
  444. fc->error = afs_abort_to_error(fc->ac.abort_code);
  445. fc->flags |= AFS_FS_CURSOR_STOP;
  446. _leave(" = f [abort]");
  447. return false;
  448. case -ERFKILL:
  449. case -EADDRNOTAVAIL:
  450. case -ENETUNREACH:
  451. case -EHOSTUNREACH:
  452. case -EHOSTDOWN:
  453. case -ECONNREFUSED:
  454. case -ETIMEDOUT:
  455. case -ETIME:
  456. _debug("no conn");
  457. fc->error = error;
  458. goto iterate_address;
  459. }
  460. iterate_address:
  461. /* Iterate over the current server's address list to try and find an
  462. * address on which it will respond to us.
  463. */
  464. if (afs_iterate_addresses(&fc->ac)) {
  465. _leave(" = t");
  466. return true;
  467. }
  468. afs_end_cursor(&fc->ac);
  469. return false;
  470. }
  471. /*
  472. * Dump cursor state in the case of the error being EDESTADDRREQ.
  473. */
  474. static void afs_dump_edestaddrreq(const struct afs_fs_cursor *fc)
  475. {
  476. static int count;
  477. int i;
  478. if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR) || count > 3)
  479. return;
  480. count++;
  481. rcu_read_lock();
  482. pr_notice("EDESTADDR occurred\n");
  483. pr_notice("FC: cbb=%x cbb2=%x fl=%hx err=%hd\n",
  484. fc->cb_break, fc->cb_break_2, fc->flags, fc->error);
  485. pr_notice("FC: ut=%lx ix=%d ni=%u\n",
  486. fc->untried, fc->index, fc->nr_iterations);
  487. if (fc->server_list) {
  488. const struct afs_server_list *sl = fc->server_list;
  489. pr_notice("FC: SL nr=%u pr=%u vnov=%hx\n",
  490. sl->nr_servers, sl->preferred, sl->vnovol_mask);
  491. for (i = 0; i < sl->nr_servers; i++) {
  492. const struct afs_server *s = sl->servers[i].server;
  493. pr_notice("FC: server fl=%lx av=%u %pU\n",
  494. s->flags, s->addr_version, &s->uuid);
  495. if (s->addresses) {
  496. const struct afs_addr_list *a =
  497. rcu_dereference(s->addresses);
  498. pr_notice("FC: - av=%u nr=%u/%u/%u pr=%u\n",
  499. a->version,
  500. a->nr_ipv4, a->nr_addrs, a->max_addrs,
  501. a->preferred);
  502. pr_notice("FC: - pr=%lx R=%lx F=%lx\n",
  503. a->probed, a->responded, a->failed);
  504. if (a == fc->ac.alist)
  505. pr_notice("FC: - current\n");
  506. }
  507. }
  508. }
  509. pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n",
  510. fc->ac.tried, fc->ac.index, fc->ac.abort_code, fc->ac.error,
  511. fc->ac.responded, fc->ac.nr_iterations);
  512. rcu_read_unlock();
  513. }
  514. /*
  515. * Tidy up a filesystem cursor and unlock the vnode.
  516. */
  517. int afs_end_vnode_operation(struct afs_fs_cursor *fc)
  518. {
  519. struct afs_net *net = afs_v2net(fc->vnode);
  520. if (fc->error == -EDESTADDRREQ ||
  521. fc->error == -EADDRNOTAVAIL ||
  522. fc->error == -ENETUNREACH ||
  523. fc->error == -EHOSTUNREACH)
  524. afs_dump_edestaddrreq(fc);
  525. mutex_unlock(&fc->vnode->io_lock);
  526. afs_end_cursor(&fc->ac);
  527. afs_put_cb_interest(net, fc->cbi);
  528. afs_put_serverlist(net, fc->server_list);
  529. if (fc->error == -ECONNABORTED)
  530. fc->error = afs_abort_to_error(fc->ac.abort_code);
  531. return fc->error;
  532. }