dm-btree.c

/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree-internal.h"
#include "dm-space-map.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*----------------------------------------------------------------
 * Array manipulation
 *--------------------------------------------------------------*/
static void memcpy_disk(void *dest, const void *src, size_t len)
	__dm_written_to_disk(src)
{
	memcpy(dest, src, len);
	__dm_unbless_for_disk(src);
}

static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
			 unsigned index, void *elt)
	__dm_written_to_disk(elt)
{
	if (index < nr_elts)
		memmove(base + (elt_size * (index + 1)),
			base + (elt_size * index),
			(nr_elts - index) * elt_size);

	memcpy_disk(base + (elt_size * index), elt, elt_size);
}

/*----------------------------------------------------------------*/

/* makes the assumption that no two keys are the same. */
static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
{
	int lo = -1, hi = le32_to_cpu(n->header.nr_entries);

	while (hi - lo > 1) {
		int mid = lo + ((hi - lo) / 2);
		uint64_t mid_key = le64_to_cpu(n->keys[mid]);

		if (mid_key == key)
			return mid;

		if (mid_key < key)
			lo = mid;
		else
			hi = mid;
	}

	return want_hi ? hi : lo;
}

int lower_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 0);
}
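/*
 * For illustration only: with keys 10, 20, 30 at indices 0..2,
 * lower_bound(n, 20) returns 1 (exact match), lower_bound(n, 25)
 * returns 1 (index of the largest key <= 25), and lower_bound(n, 5)
 * returns -1 (no key <= 5).
 */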
void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
		  struct dm_btree_value_type *vt)
{
	unsigned i;
	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);

	if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
		for (i = 0; i < nr_entries; i++)
			dm_tm_inc(tm, value64(n, i));
	else if (vt->inc)
		for (i = 0; i < nr_entries; i++)
			vt->inc(vt->context, value_ptr(n, i));
}

static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
		     uint64_t key, void *value)
	__dm_written_to_disk(value)
{
	uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
	__le64 key_le = cpu_to_le64(key);

	if (index > nr_entries ||
	    index >= le32_to_cpu(node->header.max_entries)) {
		DMERR("too many entries in btree node for insert");
		__dm_unbless_for_disk(value);
		return -ENOMEM;
	}

	__dm_bless_for_disk(&key_le);

	array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
	array_insert(value_base(node), value_size, nr_entries, index, value);

	node->header.nr_entries = cpu_to_le32(nr_entries + 1);

	return 0;
}

/*----------------------------------------------------------------*/

/*
 * We want 3n entries (for some n).  This works more nicely for repeated
 * insert/remove loops than (2n + 1).
 */
static uint32_t calc_max_entries(size_t value_size, size_t block_size)
{
	uint32_t total, n;
	size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */

	block_size -= sizeof(struct node_header);
	total = block_size / elt_size;
	n = total / 3;		/* rounds down */

	return 3 * n;
}
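/*
 * Worked example (illustrative, assuming a 32 byte struct node_header):
 * with 4096 byte blocks and 8 byte (__le64) values, elt_size is 16,
 * total = (4096 - 32) / 16 = 254, n = 254 / 3 = 84, so max_entries = 252.
 */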
int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
{
	int r;
	struct dm_block *b;
	struct btree_node *n;
	size_t block_size;
	uint32_t max_entries;

	r = new_block(info, &b);
	if (r < 0)
		return r;

	block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
	max_entries = calc_max_entries(info->value_type.size, block_size);

	n = dm_block_data(b);
	memset(n, 0, block_size);
	n->header.flags = cpu_to_le32(LEAF_NODE);
	n->header.nr_entries = cpu_to_le32(0);
	n->header.max_entries = cpu_to_le32(max_entries);
	n->header.value_size = cpu_to_le32(info->value_type.size);

	*root = dm_block_location(b);
	unlock_block(info, b);

	return 0;
}
EXPORT_SYMBOL_GPL(dm_btree_empty);
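/*
 * Illustrative usage sketch (not part of this file): a caller typically
 * fills in a struct dm_btree_info before creating an empty tree.  Only the
 * fields this file itself uses (info->tm, info->levels, info->value_type.*)
 * are shown; "tm" is an open transaction manager assumed from the caller.
 *
 *	struct dm_btree_info info;
 *	dm_block_t root;
 *	int r;
 *
 *	memset(&info, 0, sizeof(info));
 *	info.tm = tm;
 *	info.levels = 1;
 *	info.value_type.size = sizeof(__le64);
 *	(value_type.inc, .dec and .equal stay NULL for plain values)
 *
 *	r = dm_btree_empty(&info, &root);
 */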
/*----------------------------------------------------------------*/

/*
 * Deletion uses a recursive algorithm; since we have limited stack space
 * we explicitly manage our own stack on the heap.
 */
#define MAX_SPINE_DEPTH 64
struct frame {
	struct dm_block *b;
	struct btree_node *n;
	unsigned level;
	unsigned nr_children;
	unsigned current_child;
};

struct del_stack {
	struct dm_btree_info *info;
	struct dm_transaction_manager *tm;
	int top;
	struct frame spine[MAX_SPINE_DEPTH];
};

static int top_frame(struct del_stack *s, struct frame **f)
{
	if (s->top < 0) {
		DMERR("btree deletion stack empty");
		return -EINVAL;
	}

	*f = s->spine + s->top;

	return 0;
}

static int unprocessed_frames(struct del_stack *s)
{
	return s->top >= 0;
}

static void prefetch_children(struct del_stack *s, struct frame *f)
{
	unsigned i;
	struct dm_block_manager *bm = dm_tm_get_bm(s->tm);

	for (i = 0; i < f->nr_children; i++)
		dm_bm_prefetch(bm, value64(f->n, i));
}

static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
{
	return f->level < (info->levels - 1);
}

static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
{
	int r;
	uint32_t ref_count;

	if (s->top >= MAX_SPINE_DEPTH - 1) {
		DMERR("btree deletion stack out of memory");
		return -ENOMEM;
	}

	r = dm_tm_ref(s->tm, b, &ref_count);
	if (r)
		return r;

	if (ref_count > 1)
		/*
		 * This is a shared node, so we can just decrement its
		 * reference counter and leave the children.
		 */
		dm_tm_dec(s->tm, b);

	else {
		uint32_t flags;
		struct frame *f = s->spine + ++s->top;

		r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
		if (r) {
			s->top--;
			return r;
		}

		f->n = dm_block_data(f->b);
		f->level = level;
		f->nr_children = le32_to_cpu(f->n->header.nr_entries);
		f->current_child = 0;

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE || is_internal_level(s->info, f))
			prefetch_children(s, f);
	}

	return 0;
}

static void pop_frame(struct del_stack *s)
{
	struct frame *f = s->spine + s->top--;

	dm_tm_dec(s->tm, dm_block_location(f->b));
	dm_tm_unlock(s->tm, f->b);
}

int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
{
	int r;
	struct del_stack *s;

	s = kmalloc(sizeof(*s), GFP_NOIO);
	if (!s)
		return -ENOMEM;
	s->info = info;
	s->tm = info->tm;
	s->top = -1;

	r = push_frame(s, root, 0);
	if (r)
		goto out;

	while (unprocessed_frames(s)) {
		uint32_t flags;
		struct frame *f;
		dm_block_t b;

		r = top_frame(s, &f);
		if (r)
			goto out;

		if (f->current_child >= f->nr_children) {
			pop_frame(s);
			continue;
		}

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level);
			if (r)
				goto out;

		} else if (is_internal_level(info, f)) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level + 1);
			if (r)
				goto out;

		} else {
			if (info->value_type.dec) {
				unsigned i;

				for (i = 0; i < f->nr_children; i++)
					info->value_type.dec(info->value_type.context,
							     value_ptr(f->n, i));
			}
			pop_frame(s);
		}
	}

out:
	kfree(s);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_del);
/*----------------------------------------------------------------*/

static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
			    int (*search_fn)(struct btree_node *, uint64_t),
			    uint64_t *result_key, void *v, size_t value_size)
{
	int i, r;
	uint32_t flags, nr_entries;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		i = search_fn(ro_node(s), key);

		flags = le32_to_cpu(ro_node(s)->header.flags);
		nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (i < 0 || i >= nr_entries)
			return -ENODATA;

		if (flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (!(flags & LEAF_NODE));

	*result_key = le64_to_cpu(ro_node(s)->keys[i]);
	memcpy(v, value_ptr(ro_node(s), i), value_size);

	return 0;
}

int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value_le)
{
	unsigned level, last_level = info->levels - 1;
	int r = -ENODATA;
	uint64_t rkey;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		size_t size;
		void *value_p;

		if (level == last_level) {
			value_p = value_le;
			size = info->value_type.size;

		} else {
			value_p = &internal_value_le;
			size = sizeof(uint64_t);
		}

		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, &rkey,
				     value_p, size);

		if (!r) {
			if (rkey != keys[level]) {
				exit_ro_spine(&spine);
				return -ENODATA;
			}
		} else {
			exit_ro_spine(&spine);
			return r;
		}

		root = le64_to_cpu(internal_value_le);
	}
	exit_ro_spine(&spine);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup);
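/*
 * Illustrative usage sketch (not part of this file): reading a packed
 * 64-bit value out of a single level tree with __le64 values, as in the
 * dm_btree_empty sketch above.  do_something() and handle_missing_key()
 * are placeholders.
 *
 *	__le64 value_le;
 *	int r;
 *
 *	r = dm_btree_lookup(&info, root, &key, &value_le);
 *	if (!r)
 *		do_something(le64_to_cpu(value_le));
 *	else if (r == -ENODATA)
 *		handle_missing_key();
 */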
/*
 * Splits a node by creating a sibling node and shifting half the node's
 * contents across.  Assumes there is a parent node, and it has room for
 * another child.
 *
 * Before:
 *        +--------+
 *        | Parent |
 *        +--------+
 *            |
 *            v
 *      +----------+
 *      | A ++++++ |
 *      +----------+
 *
 *
 * After:
 *              +--------+
 *              | Parent |
 *              +--------+
 *                |     |
 *                v     +------+
 *          +---------+        |
 *          | A* +++  |        v
 *          +---------+  +-------+
 *                       | B +++ |
 *                       +-------+
 *
 * Where A* is a shadow of A.
 */
static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index,
			       uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *parent;
	struct btree_node *ln, *rn, *pn;
	__le64 location;

	left = shadow_current(s);

	r = new_block(s->info, &right);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	rn = dm_block_data(right);

	nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
	nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;

	ln->header.nr_entries = cpu_to_le32(nr_left);

	rn->header.flags = ln->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = ln->header.max_entries;
	rn->header.value_size = ln->header.value_size;
	memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));

	size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
		sizeof(uint64_t) : s->info->value_type.size;
	memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
	       size * nr_right);

	/*
	 * Patch up the parent
	 */
	parent = shadow_parent(s);

	pn = dm_block_data(parent);
	location = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&location);
	memcpy_disk(value_ptr(pn, parent_index),
		    &location, sizeof(__le64));

	location = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&location);

	r = insert_at(sizeof(__le64), pn, parent_index + 1,
		      le64_to_cpu(rn->keys[0]), &location);
	if (r)
		return r;

	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}

	return 0;
}
/*
 * Splits a node by creating two new children beneath the given node.
 *
 * Before:
 *        +----------+
 *        | A ++++++ |
 *        +----------+
 *
 *
 * After:
 *      +------------+
 *      | A (shadow) |
 *      +------------+
 *          |    |
 *   +------+    +----+
 *   |                |
 *   v                v
 * +-------+       +-------+
 * | B +++ |       | C +++ |
 * +-------+       +-------+
 */
static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *new_parent;
	struct btree_node *pn, *ln, *rn;
	__le64 val;

	new_parent = shadow_current(s);

	r = new_block(s->info, &left);
	if (r < 0)
		return r;

	r = new_block(s->info, &right);
	if (r < 0) {
		/* FIXME: put left */
		return r;
	}

	pn = dm_block_data(new_parent);
	ln = dm_block_data(left);
	rn = dm_block_data(right);

	nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
	nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;

	ln->header.flags = pn->header.flags;
	ln->header.nr_entries = cpu_to_le32(nr_left);
	ln->header.max_entries = pn->header.max_entries;
	ln->header.value_size = pn->header.value_size;

	rn->header.flags = pn->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = pn->header.max_entries;
	rn->header.value_size = pn->header.value_size;

	memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
	memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));

	size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
		sizeof(__le64) : s->info->value_type.size;
	memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
	memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
	       nr_right * size);

	/* new_parent should just point to l and r now */
	pn->header.flags = cpu_to_le32(INTERNAL_NODE);
	pn->header.nr_entries = cpu_to_le32(2);
	pn->header.max_entries = cpu_to_le32(
		calc_max_entries(sizeof(__le64),
				 dm_bm_block_size(
					 dm_tm_get_bm(s->info->tm))));
	pn->header.value_size = cpu_to_le32(sizeof(__le64));

	val = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&val);
	pn->keys[0] = ln->keys[0];
	memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));

	val = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&val);
	pn->keys[1] = rn->keys[0];
	memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));

	/*
	 * rejig the spine.  This is ugly, since it knows too
	 * much about the spine
	 */
	if (s->nodes[0] != new_parent) {
		unlock_block(s->info, s->nodes[0]);
		s->nodes[0] = new_parent;
	}
	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}
	s->count = 2;

	return 0;
}
static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
			    struct dm_btree_value_type *vt,
			    uint64_t key, unsigned *index)
{
	int r, i = *index, top = 1;
	struct btree_node *node;

	for (;;) {
		r = shadow_step(s, root, vt);
		if (r < 0)
			return r;

		node = dm_block_data(shadow_current(s));

		/*
		 * We have to patch up the parent node, ugly, but I don't
		 * see a way to do this automatically as part of the spine
		 * op.
		 */
		if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unness. */
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			__dm_bless_for_disk(&location);
			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
				    &location, sizeof(__le64));
		}

		node = dm_block_data(shadow_current(s));

		if (node->header.nr_entries == node->header.max_entries) {
			if (top)
				r = btree_split_beneath(s, key);
			else
				r = btree_split_sibling(s, i, key);

			if (r < 0)
				return r;
		}

		node = dm_block_data(shadow_current(s));

		i = lower_bound(node, key);

		if (le32_to_cpu(node->header.flags) & LEAF_NODE)
			break;

		if (i < 0) {
			/* change the bounds on the lowest key */
			node->keys[0] = cpu_to_le64(key);
			i = 0;
		}

		root = value64(node, i);
		top = 0;
	}

	if (i < 0 || le64_to_cpu(node->keys[i]) != key)
		i++;

	*index = i;
	return 0;
}
static int insert(struct dm_btree_info *info, dm_block_t root,
		  uint64_t *keys, void *value, dm_block_t *new_root,
		  int *inserted)
	__dm_written_to_disk(value)
{
	int r, need_insert;
	unsigned level, index = -1, last_level = info->levels - 1;
	dm_block_t block = root;
	struct shadow_spine spine;
	struct btree_node *n;
	struct dm_btree_value_type le64_type;

	init_le64_type(info->tm, &le64_type);
	init_shadow_spine(&spine, info);

	for (level = 0; level < (info->levels - 1); level++) {
		r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
		if (r < 0)
			goto bad;

		n = dm_block_data(shadow_current(&spine));
		need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
			       (le64_to_cpu(n->keys[index]) != keys[level]));

		if (need_insert) {
			dm_block_t new_tree;
			__le64 new_le;

			r = dm_btree_empty(info, &new_tree);
			if (r < 0)
				goto bad;

			new_le = cpu_to_le64(new_tree);
			__dm_bless_for_disk(&new_le);

			r = insert_at(sizeof(uint64_t), n, index,
				      keys[level], &new_le);
			if (r)
				goto bad;
		}

		if (level < last_level)
			block = value64(n, index);
	}

	r = btree_insert_raw(&spine, block, &info->value_type,
			     keys[level], &index);
	if (r < 0)
		goto bad;

	n = dm_block_data(shadow_current(&spine));
	need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
		       (le64_to_cpu(n->keys[index]) != keys[level]));

	if (need_insert) {
		if (inserted)
			*inserted = 1;

		r = insert_at(info->value_type.size, n, index,
			      keys[level], value);
		if (r)
			goto bad_unblessed;
	} else {
		if (inserted)
			*inserted = 0;

		if (info->value_type.dec &&
		    (!info->value_type.equal ||
		     !info->value_type.equal(
			     info->value_type.context,
			     value_ptr(n, index),
			     value))) {
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index));
		}
		memcpy_disk(value_ptr(n, index),
			    value, info->value_type.size);
	}

	*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return 0;

bad:
	__dm_unbless_for_disk(value);
bad_unblessed:
	exit_shadow_spine(&spine);
	return r;
}

int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value, dm_block_t *new_root)
	__dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, NULL);
}
EXPORT_SYMBOL_GPL(dm_btree_insert);

int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
			   uint64_t *keys, void *value, dm_block_t *new_root,
			   int *inserted)
	__dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, inserted);
}
EXPORT_SYMBOL_GPL(dm_btree_insert_notify);
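/*
 * Illustrative usage sketch (not part of this file): inserting a packed
 * 64-bit value under a single key, continuing the single level example
 * above.  The __dm_bless_for_disk() call marks the value as on-disk data,
 * matching the __dm_written_to_disk(value) annotation on dm_btree_insert.
 *
 *	uint64_t key = 1234;
 *	__le64 value = cpu_to_le64(5678);
 *	dm_block_t new_root;
 *	int r;
 *
 *	__dm_bless_for_disk(&value);
 *	r = dm_btree_insert(&info, root, &key, &value, &new_root);
 *	if (!r)
 *		root = new_root;
 */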
/*----------------------------------------------------------------*/

static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
		    uint64_t *result_key, dm_block_t *next_block)
{
	int i, r;
	uint32_t flags;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		flags = le32_to_cpu(ro_node(s)->header.flags);
		i = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (!i)
			return -ENODATA;
		else
			i--;

		if (find_highest)
			*result_key = le64_to_cpu(ro_node(s)->keys[i]);
		else
			*result_key = le64_to_cpu(ro_node(s)->keys[0]);

		if (next_block || flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (flags & INTERNAL_NODE);

	if (next_block)
		*next_block = block;
	return 0;
}

static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root,
			     bool find_highest, uint64_t *result_keys)
{
	int r = 0, count = 0, level;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		r = find_key(&spine, root, find_highest, result_keys + level,
			     level == info->levels - 1 ? NULL : &root);
		if (r == -ENODATA) {
			r = 0;
			break;

		} else if (r)
			break;

		count++;
	}
	exit_ro_spine(&spine);

	return r ? r : count;
}

int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
			      uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, true, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);

int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
			     uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, false, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key);
/*----------------------------------------------------------------*/

/*
 * FIXME: We shouldn't use a recursive algorithm when we have limited stack
 * space.  Also this only works for single level trees.
 */
static int walk_node(struct dm_btree_info *info, dm_block_t block,
		     int (*fn)(void *context, uint64_t *keys, void *leaf),
		     void *context)
{
	int r;
	unsigned i, nr;
	struct dm_block *node;
	struct btree_node *n;
	uint64_t keys;

	r = bn_read_lock(info, block, &node);
	if (r)
		return r;

	n = dm_block_data(node);

	nr = le32_to_cpu(n->header.nr_entries);
	for (i = 0; i < nr; i++) {
		if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
			r = walk_node(info, value64(n, i), fn, context);
			if (r)
				goto out;
		} else {
			keys = le64_to_cpu(*key_ptr(n, i));
			r = fn(context, &keys, value_ptr(n, i));
			if (r)
				goto out;
		}
	}

out:
	dm_tm_unlock(info->tm, node);
	return r;
}

int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
		  int (*fn)(void *context, uint64_t *keys, void *leaf),
		  void *context)
{
	BUG_ON(info->levels > 1);
	return walk_node(info, root, fn, context);
}
EXPORT_SYMBOL_GPL(dm_btree_walk);
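/*
 * Illustrative usage sketch (not part of this file): a walk callback that
 * counts leaf entries in a single level tree (dm_btree_walk BUG()s on
 * multi level trees).  count_entries() and the calling code below are
 * placeholders.
 *
 *	static int count_entries(void *context, uint64_t *keys, void *leaf)
 *	{
 *		unsigned *count = context;
 *
 *		(*count)++;
 *		return 0;
 *	}
 *
 *	unsigned count = 0;
 *	int r = dm_btree_walk(&info, root, count_entries, &count);
 */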