extent_tree.c

/*
 * Copyright (c) 2014 Christoph Hellwig.
 */
#include <linux/vmalloc.h>

#include "blocklayout.h"

#define NFSDBG_FACILITY		NFSDBG_PNFS_LD
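
/*
 * Block extents are tracked in two red-black trees per layout (one for
 * read-only, one for read-write extents), sorted by file offset.  The
 * helpers below wrap the generic rbtree iterators for the embedded
 * be_node of struct pnfs_block_extent.
 */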
static inline struct pnfs_block_extent *
ext_node(struct rb_node *node)
{
        return rb_entry(node, struct pnfs_block_extent, be_node);
}

static struct pnfs_block_extent *
ext_tree_first(struct rb_root *root)
{
        struct rb_node *node = rb_first(root);
        return node ? ext_node(node) : NULL;
}

static struct pnfs_block_extent *
ext_tree_prev(struct pnfs_block_extent *be)
{
        struct rb_node *node = rb_prev(&be->be_node);
        return node ? ext_node(node) : NULL;
}

static struct pnfs_block_extent *
ext_tree_next(struct pnfs_block_extent *be)
{
        struct rb_node *node = rb_next(&be->be_node);
        return node ? ext_node(node) : NULL;
}
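
/* First file sector beyond the end of the extent. */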
static inline sector_t
ext_f_end(struct pnfs_block_extent *be)
{
        return be->be_f_offset + be->be_length;
}
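
/*
 * Return the extent covering @start, or, if no extent covers it, the
 * next extent starting after @start; NULL if the tree ends before
 * @start.
 */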
static struct pnfs_block_extent *
__ext_tree_search(struct rb_root *root, sector_t start)
{
        struct rb_node *node = root->rb_node;
        struct pnfs_block_extent *be = NULL;

        while (node) {
                be = ext_node(node);
                if (start < be->be_f_offset)
                        node = node->rb_left;
                else if (start >= ext_f_end(be))
                        node = node->rb_right;
                else
                        return be;
        }

        if (be) {
                if (start < be->be_f_offset)
                        return be;
                if (start >= ext_f_end(be))
                        return ext_tree_next(be);
        }
        return NULL;
}
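
/*
 * Extents can be merged when they are contiguous in the file, share
 * state and device, are contiguous on disk (irrelevant for extents
 * without a disk mapping), and, for invalid data, carry the same tag.
 */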
static bool
ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2)
{
        if (be1->be_state != be2->be_state)
                return false;
        if (be1->be_device != be2->be_device)
                return false;
        if (be1->be_f_offset + be1->be_length != be2->be_f_offset)
                return false;
        if (be1->be_state != PNFS_BLOCK_NONE_DATA &&
            (be1->be_v_offset + be1->be_length != be2->be_v_offset))
                return false;
        if (be1->be_state == PNFS_BLOCK_INVALID_DATA &&
            be1->be_tag != be2->be_tag)
                return false;
        return true;
}
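
/*
 * Fold @be into its predecessor (ext_try_to_merge_left) or fold the
 * successor into @be (ext_try_to_merge_right) when the pair is
 * mergeable.  The absorbed extent is erased from the tree, its device
 * reference dropped and its memory freed; the survivor is returned.
 */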
static struct pnfs_block_extent *
ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be)
{
        struct pnfs_block_extent *left = ext_tree_prev(be);

        if (left && ext_can_merge(left, be)) {
                left->be_length += be->be_length;
                rb_erase(&be->be_node, root);
                nfs4_put_deviceid_node(be->be_device);
                kfree(be);
                return left;
        }

        return be;
}

static struct pnfs_block_extent *
ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
{
        struct pnfs_block_extent *right = ext_tree_next(be);

        if (right && ext_can_merge(be, right)) {
                be->be_length += right->be_length;
                rb_erase(&right->be_node, root);
                nfs4_put_deviceid_node(right->be_device);
                kfree(right);
        }

        return be;
}
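
/*
 * Link @new into the tree, merging it with an adjacent extent when
 * @merge_ok allows it.  Callers must ensure that @new does not overlap
 * any existing extent; an overlap here is a BUG().
 */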
static void
__ext_tree_insert(struct rb_root *root,
                struct pnfs_block_extent *new, bool merge_ok)
{
        struct rb_node **p = &root->rb_node, *parent = NULL;
        struct pnfs_block_extent *be;

        while (*p) {
                parent = *p;
                be = ext_node(parent);

                if (new->be_f_offset < be->be_f_offset) {
                        if (merge_ok && ext_can_merge(new, be)) {
                                be->be_f_offset = new->be_f_offset;
                                if (be->be_state != PNFS_BLOCK_NONE_DATA)
                                        be->be_v_offset = new->be_v_offset;
                                be->be_length += new->be_length;
                                be = ext_try_to_merge_left(root, be);
                                goto free_new;
                        }
                        p = &(*p)->rb_left;
                } else if (new->be_f_offset >= ext_f_end(be)) {
                        if (merge_ok && ext_can_merge(be, new)) {
                                be->be_length += new->be_length;
                                be = ext_try_to_merge_right(root, be);
                                goto free_new;
                        }
                        p = &(*p)->rb_right;
                } else {
                        BUG();
                }
        }

        rb_link_node(&new->be_node, parent, p);
        rb_insert_color(&new->be_node, root);
        return;
free_new:
        nfs4_put_deviceid_node(new->be_device);
        kfree(new);
}
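
/*
 * Remove the range [start, end) from the tree: extents that straddle a
 * boundary are trimmed, fully covered ones are freed, and an extent
 * spanning the whole range is split in two, which requires an
 * allocation.
 */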
static int
__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
{
        struct pnfs_block_extent *be;
        sector_t len1 = 0, len2 = 0;
        sector_t orig_v_offset;
        sector_t orig_len;

        be = __ext_tree_search(root, start);
        if (!be)
                return 0;
        if (be->be_f_offset >= end)
                return 0;

        orig_v_offset = be->be_v_offset;
        orig_len = be->be_length;

        if (start > be->be_f_offset)
                len1 = start - be->be_f_offset;
        if (ext_f_end(be) > end)
                len2 = ext_f_end(be) - end;

        if (len2 > 0) {
                if (len1 > 0) {
                        struct pnfs_block_extent *new;

                        new = kzalloc(sizeof(*new), GFP_ATOMIC);
                        if (!new)
                                return -ENOMEM;

                        be->be_length = len1;

                        new->be_f_offset = end;
                        if (be->be_state != PNFS_BLOCK_NONE_DATA) {
                                new->be_v_offset =
                                        orig_v_offset + orig_len - len2;
                        }
                        new->be_length = len2;
                        new->be_state = be->be_state;
                        new->be_tag = be->be_tag;
                        new->be_device = nfs4_get_deviceid(be->be_device);

                        __ext_tree_insert(root, new, true);
                } else {
                        be->be_f_offset = end;
                        if (be->be_state != PNFS_BLOCK_NONE_DATA) {
                                be->be_v_offset =
                                        orig_v_offset + orig_len - len2;
                        }
                        be->be_length = len2;
                }
        } else {
                if (len1 > 0) {
                        be->be_length = len1;
                        be = ext_tree_next(be);
                }

                while (be && ext_f_end(be) <= end) {
                        struct pnfs_block_extent *next = ext_tree_next(be);

                        rb_erase(&be->be_node, root);
                        nfs4_put_deviceid_node(be->be_device);
                        kfree(be);
                        be = next;
                }

                if (be && be->be_f_offset < end) {
                        len1 = ext_f_end(be) - end;
                        be->be_f_offset = end;
                        if (be->be_state != PNFS_BLOCK_NONE_DATA)
                                be->be_v_offset += be->be_length - len1;
                        be->be_length = len1;
                }
        }

        return 0;
}
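
/*
 * Insert an extent returned by the server into the appropriate tree.
 * Extents already in the tree take precedence: @new is trimmed or
 * split around them and only the previously uncovered pieces are
 * inserted.
 */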
int
ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new)
{
        struct pnfs_block_extent *be;
        struct rb_root *root;
        int err = 0;

        switch (new->be_state) {
        case PNFS_BLOCK_READWRITE_DATA:
        case PNFS_BLOCK_INVALID_DATA:
                root = &bl->bl_ext_rw;
                break;
        case PNFS_BLOCK_READ_DATA:
        case PNFS_BLOCK_NONE_DATA:
                root = &bl->bl_ext_ro;
                break;
        default:
                dprintk("invalid extent type\n");
                return -EINVAL;
        }

        spin_lock(&bl->bl_ext_lock);
retry:
        be = __ext_tree_search(root, new->be_f_offset);
        if (!be || be->be_f_offset >= ext_f_end(new)) {
                __ext_tree_insert(root, new, true);
        } else if (new->be_f_offset >= be->be_f_offset) {
                if (ext_f_end(new) <= ext_f_end(be)) {
                        nfs4_put_deviceid_node(new->be_device);
                        kfree(new);
                } else {
                        sector_t new_len = ext_f_end(new) - ext_f_end(be);
                        sector_t diff = new->be_length - new_len;

                        new->be_f_offset += diff;
                        new->be_v_offset += diff;
                        new->be_length = new_len;
                        goto retry;
                }
        } else if (ext_f_end(new) <= ext_f_end(be)) {
                new->be_length = be->be_f_offset - new->be_f_offset;
                __ext_tree_insert(root, new, true);
        } else {
                struct pnfs_block_extent *split;
                sector_t new_len = ext_f_end(new) - ext_f_end(be);
                sector_t diff = new->be_length - new_len;

                split = kmemdup(new, sizeof(*new), GFP_ATOMIC);
                if (!split) {
                        err = -ENOMEM;
                        goto out;
                }

                split->be_length = be->be_f_offset - split->be_f_offset;
                split->be_device = nfs4_get_deviceid(new->be_device);
                __ext_tree_insert(root, split, true);

                new->be_f_offset += diff;
                new->be_v_offset += diff;
                new->be_length = new_len;
                goto retry;
        }
out:
        spin_unlock(&bl->bl_ext_lock);
        return err;
}
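
/*
 * Copy the extent covering @isect into @ret by value, so the caller
 * can keep using it after bl_ext_lock has been dropped.
 */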
static bool
__ext_tree_lookup(struct rb_root *root, sector_t isect,
                struct pnfs_block_extent *ret)
{
        struct rb_node *node;
        struct pnfs_block_extent *be;

        node = root->rb_node;
        while (node) {
                be = ext_node(node);
                if (isect < be->be_f_offset)
                        node = node->rb_left;
                else if (isect >= ext_f_end(be))
                        node = node->rb_right;
                else {
                        *ret = *be;
                        return true;
                }
        }

        return false;
}

bool
ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
                struct pnfs_block_extent *ret, bool rw)
{
        bool found = false;

        spin_lock(&bl->bl_ext_lock);
        if (!rw)
                found = __ext_tree_lookup(&bl->bl_ext_ro, isect, ret);
        if (!found)
                found = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret);
        spin_unlock(&bl->bl_ext_lock);

        return found;
}
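
/*
 * Remove [start, end) from the read-only tree, and from the read-write
 * tree as well when @rw is set.
 */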
int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
                sector_t start, sector_t end)
{
        int err, err2;

        spin_lock(&bl->bl_ext_lock);
        err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
        if (rw) {
                err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end);
                if (!err)
                        err = err2;
        }
        spin_unlock(&bl->bl_ext_lock);

        return err;
}
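
/*
 * Split @be at sector @split and insert a new extent for the upper
 * half.  The insert is done with merge_ok set to false, as the new
 * half would otherwise simply be merged back into the extent it was
 * just split from.
 */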
static int
ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
                sector_t split)
{
        struct pnfs_block_extent *new;
        sector_t orig_len = be->be_length;

        new = kzalloc(sizeof(*new), GFP_ATOMIC);
        if (!new)
                return -ENOMEM;

        be->be_length = split - be->be_f_offset;

        new->be_f_offset = split;
        if (be->be_state != PNFS_BLOCK_NONE_DATA)
                new->be_v_offset = be->be_v_offset + be->be_length;
        new->be_length = orig_len - be->be_length;
        new->be_state = be->be_state;
        new->be_tag = be->be_tag;
        new->be_device = nfs4_get_deviceid(be->be_device);

        __ext_tree_insert(root, new, false);
        return 0;
}
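
/*
 * Mark [start, start + len) as written: first drop the COW source
 * extents and holes from the read-only tree, then tag every invalid
 * extent in the range as EXTENT_WRITTEN so that a later layoutcommit
 * picks it up.  Extents sticking out of the range are split, or shrunk
 * in favour of a mergeable neighbour, beforehand.
 */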
int
ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
                sector_t len)
{
        struct rb_root *root = &bl->bl_ext_rw;
        sector_t end = start + len;
        struct pnfs_block_extent *be;
        int err = 0;

        spin_lock(&bl->bl_ext_lock);
        /*
         * First remove all COW extents or holes from written to range.
         */
        err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
        if (err)
                goto out;

        /*
         * Then mark all invalid extents in the range as written to.
         */
        for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) {
                if (be->be_f_offset >= end)
                        break;

                if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag)
                        continue;

                if (be->be_f_offset < start) {
                        struct pnfs_block_extent *left = ext_tree_prev(be);

                        if (left && ext_can_merge(left, be)) {
                                sector_t diff = start - be->be_f_offset;

                                left->be_length += diff;

                                be->be_f_offset += diff;
                                be->be_v_offset += diff;
                                be->be_length -= diff;
                        } else {
                                err = ext_tree_split(root, be, start);
                                if (err)
                                        goto out;
                        }
                }

                if (ext_f_end(be) > end) {
                        struct pnfs_block_extent *right = ext_tree_next(be);

                        if (right && ext_can_merge(be, right)) {
                                /* hand the tail [end, ext_f_end(be)) over
                                 * to the mergeable right neighbour */
                                sector_t diff = ext_f_end(be) - end;

                                be->be_length -= diff;

                                right->be_f_offset -= diff;
                                right->be_v_offset -= diff;
                                right->be_length += diff;
                        } else {
                                err = ext_tree_split(root, be, end);
                                if (err)
                                        goto out;
                        }
                }

                if (be->be_f_offset >= start && ext_f_end(be) <= end) {
                        be->be_tag = EXTENT_WRITTEN;
                        be = ext_try_to_merge_left(root, be);
                        be = ext_try_to_merge_right(root, be);
                }
        }
out:
        spin_unlock(&bl->bl_ext_lock);
        return err;
}
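
/*
 * On-the-wire size of the layoutupdate blob: an extent count followed
 * by one fixed-size block extent per entry.
 */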
static size_t ext_tree_layoutupdate_size(size_t count)
{
        return sizeof(__be32) /* number of entries */ +
                PNFS_BLOCK_EXTENT_SIZE * count;
}

static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
                size_t buffer_size)
{
        if (arg->layoutupdate_pages != &arg->layoutupdate_page) {
                int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i;

                for (i = 0; i < nr_pages; i++)
                        put_page(arg->layoutupdate_pages[i]);
                kfree(arg->layoutupdate_pages);
        } else {
                put_page(arg->layoutupdate_page);
        }
}
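
/*
 * Encode all written extents into the layoutupdate buffer and tag them
 * EXTENT_COMMITTING.  If the buffer is too small, keep counting and
 * return -ENOSPC so that the caller can retry with a buffer sized from
 * the final count.
 */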
static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
                size_t buffer_size, size_t *count)
{
        struct pnfs_block_extent *be;
        int ret = 0;

        spin_lock(&bl->bl_ext_lock);
        for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
                if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
                    be->be_tag != EXTENT_WRITTEN)
                        continue;

                (*count)++;
                if (ext_tree_layoutupdate_size(*count) > buffer_size) {
                        /* keep counting.. */
                        ret = -ENOSPC;
                        continue;
                }

                p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
                                NFS4_DEVICEID4_SIZE);
                p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
                p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
                p = xdr_encode_hyper(p, 0LL);
                *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);

                be->be_tag = EXTENT_COMMITTING;
        }
        spin_unlock(&bl->bl_ext_lock);

        return ret;
}
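
/*
 * Build the layoutupdate payload for a LAYOUTCOMMIT.  A single page is
 * tried first; if the encode pass overflows it, the buffer is
 * reallocated with the size reported back by the failed pass and
 * backed by a page array for the RPC layer.
 */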
int
ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
{
        struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
        size_t count = 0, buffer_size = PAGE_SIZE;
        __be32 *start_p;
        int ret;

        dprintk("%s enter\n", __func__);

        arg->layoutupdate_page = alloc_page(GFP_NOFS);
        if (!arg->layoutupdate_page)
                return -ENOMEM;
        start_p = page_address(arg->layoutupdate_page);
        arg->layoutupdate_pages = &arg->layoutupdate_page;

retry:
        ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count);
        if (unlikely(ret)) {
                ext_tree_free_commitdata(arg, buffer_size);

                buffer_size = ext_tree_layoutupdate_size(count);
                count = 0;

                arg->layoutupdate_pages =
                        kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE),
                                sizeof(struct page *), GFP_NOFS);
                if (!arg->layoutupdate_pages)
                        return -ENOMEM;

                start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL);
                if (!start_p) {
                        kfree(arg->layoutupdate_pages);
                        return -ENOMEM;
                }

                goto retry;
        }

        *start_p = cpu_to_be32(count);
        arg->layoutupdate_len = ext_tree_layoutupdate_size(count);

        if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
                void *p = start_p, *end = p + arg->layoutupdate_len;
                int i = 0;

                for ( ; p < end; p += PAGE_SIZE)
                        arg->layoutupdate_pages[i++] = vmalloc_to_page(p);
        }

        dprintk("%s found %zu ranges\n", __func__, count);
        return 0;
}
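
/*
 * Process the result of a LAYOUTCOMMIT: on success move the committing
 * extents to plain read-write state, on failure tag them as written
 * again so that the next commit retries them.
 */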
void
ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status)
{
        struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
        struct rb_root *root = &bl->bl_ext_rw;
        struct pnfs_block_extent *be;

        dprintk("%s status %d\n", __func__, status);

        ext_tree_free_commitdata(arg, arg->layoutupdate_len);

        spin_lock(&bl->bl_ext_lock);
        for (be = ext_tree_first(root); be; be = ext_tree_next(be)) {
                if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
                    be->be_tag != EXTENT_COMMITTING)
                        continue;

                if (status) {
                        /*
                         * Mark as written and try again.
                         *
                         * XXX: some real error handling here wouldn't hurt..
                         */
                        be->be_tag = EXTENT_WRITTEN;
                } else {
                        be->be_state = PNFS_BLOCK_READWRITE_DATA;
                        be->be_tag = 0;
                }

                be = ext_try_to_merge_left(root, be);
                be = ext_try_to_merge_right(root, be);
        }
        spin_unlock(&bl->bl_ext_lock);
}