mapper.c 23 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001
  1. /*
  2. * Ceph - scalable distributed file system
  3. *
  4. * Copyright (C) 2015 Intel Corporation All Rights Reserved
  5. *
  6. * This is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License version 2.1, as published by the Free Software
  9. * Foundation. See file COPYING.
  10. *
  11. */
  12. #ifdef __KERNEL__
  13. # include <linux/string.h>
  14. # include <linux/slab.h>
  15. # include <linux/bug.h>
  16. # include <linux/kernel.h>
  17. # include <linux/crush/crush.h>
  18. # include <linux/crush/hash.h>
  19. # include <linux/crush/mapper.h>
  20. #else
  21. # include "crush_compat.h"
  22. # include "crush.h"
  23. # include "hash.h"
  24. # include "mapper.h"
  25. #endif
  26. #include "crush_ln_table.h"
  27. #define dprintk(args...) /* printf(args) */
  28. /*
  29. * Implement the core CRUSH mapping algorithm.
  30. */
  31. /**
  32. * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
  33. * @map: the crush_map
  34. * @ruleset: the storage ruleset id (user defined)
  35. * @type: storage ruleset type (user defined)
  36. * @size: output set size
  37. */
  38. int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size)
  39. {
  40. __u32 i;
  41. for (i = 0; i < map->max_rules; i++) {
  42. if (map->rules[i] &&
  43. map->rules[i]->mask.ruleset == ruleset &&
  44. map->rules[i]->mask.type == type &&
  45. map->rules[i]->mask.min_size <= size &&
  46. map->rules[i]->mask.max_size >= size)
  47. return i;
  48. }
  49. return -1;
  50. }
  51. /*
  52. * bucket choose methods
  53. *
  54. * For each bucket algorithm, we have a "choose" method that, given a
  55. * crush input @x and replica position (usually, position in output set) @r,
  56. * will produce an item in the bucket.
  57. */
  58. /*
  59. * Choose based on a random permutation of the bucket.
  60. *
  61. * We used to use some prime number arithmetic to do this, but it
  62. * wasn't very random, and had some other bad behaviors. Instead, we
  63. * calculate an actual random permutation of the bucket members.
  64. * Since this is expensive, we optimize for the r=0 case, which
  65. * captures the vast majority of calls.
  66. */
  67. static int bucket_perm_choose(struct crush_bucket *bucket,
  68. int x, int r)
  69. {
  70. unsigned int pr = r % bucket->size;
  71. unsigned int i, s;
  72. /* start a new permutation if @x has changed */
  73. if (bucket->perm_x != (__u32)x || bucket->perm_n == 0) {
  74. dprintk("bucket %d new x=%d\n", bucket->id, x);
  75. bucket->perm_x = x;
  76. /* optimize common r=0 case */
  77. if (pr == 0) {
  78. s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
  79. bucket->size;
  80. bucket->perm[0] = s;
  81. bucket->perm_n = 0xffff; /* magic value, see below */
  82. goto out;
  83. }
  84. for (i = 0; i < bucket->size; i++)
  85. bucket->perm[i] = i;
  86. bucket->perm_n = 0;
  87. } else if (bucket->perm_n == 0xffff) {
  88. /* clean up after the r=0 case above */
  89. for (i = 1; i < bucket->size; i++)
  90. bucket->perm[i] = i;
  91. bucket->perm[bucket->perm[0]] = 0;
  92. bucket->perm_n = 1;
  93. }
  94. /* calculate permutation up to pr */
  95. for (i = 0; i < bucket->perm_n; i++)
  96. dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
  97. while (bucket->perm_n <= pr) {
  98. unsigned int p = bucket->perm_n;
  99. /* no point in swapping the final entry */
  100. if (p < bucket->size - 1) {
  101. i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
  102. (bucket->size - p);
  103. if (i) {
  104. unsigned int t = bucket->perm[p + i];
  105. bucket->perm[p + i] = bucket->perm[p];
  106. bucket->perm[p] = t;
  107. }
  108. dprintk(" perm_choose swap %d with %d\n", p, p+i);
  109. }
  110. bucket->perm_n++;
  111. }
  112. for (i = 0; i < bucket->size; i++)
  113. dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]);
  114. s = bucket->perm[pr];
  115. out:
  116. dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
  117. bucket->size, x, r, pr, s);
  118. return bucket->items[s];
  119. }
  120. /* uniform */
  121. static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
  122. int x, int r)
  123. {
  124. return bucket_perm_choose(&bucket->h, x, r);
  125. }
  126. /* list */
  127. static int bucket_list_choose(struct crush_bucket_list *bucket,
  128. int x, int r)
  129. {
  130. int i;
  131. for (i = bucket->h.size-1; i >= 0; i--) {
  132. __u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i],
  133. r, bucket->h.id);
  134. w &= 0xffff;
  135. dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
  136. "sw %x rand %llx",
  137. i, x, r, bucket->h.items[i], bucket->item_weights[i],
  138. bucket->sum_weights[i], w);
  139. w *= bucket->sum_weights[i];
  140. w = w >> 16;
  141. /*dprintk(" scaled %llx\n", w);*/
  142. if (w < bucket->item_weights[i])
  143. return bucket->h.items[i];
  144. }
  145. dprintk("bad list sums for bucket %d\n", bucket->h.id);
  146. return bucket->h.items[0];
  147. }
  148. /* (binary) tree */
  149. static int height(int n)
  150. {
  151. int h = 0;
  152. while ((n & 1) == 0) {
  153. h++;
  154. n = n >> 1;
  155. }
  156. return h;
  157. }
  158. static int left(int x)
  159. {
  160. int h = height(x);
  161. return x - (1 << (h-1));
  162. }
  163. static int right(int x)
  164. {
  165. int h = height(x);
  166. return x + (1 << (h-1));
  167. }
  168. static int terminal(int x)
  169. {
  170. return x & 1;
  171. }
  172. static int bucket_tree_choose(struct crush_bucket_tree *bucket,
  173. int x, int r)
  174. {
  175. int n;
  176. __u32 w;
  177. __u64 t;
  178. /* start at root */
  179. n = bucket->num_nodes >> 1;
  180. while (!terminal(n)) {
  181. int l;
  182. /* pick point in [0, w) */
  183. w = bucket->node_weights[n];
  184. t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
  185. bucket->h.id) * (__u64)w;
  186. t = t >> 32;
  187. /* descend to the left or right? */
  188. l = left(n);
  189. if (t < bucket->node_weights[l])
  190. n = l;
  191. else
  192. n = right(n);
  193. }
  194. return bucket->h.items[n >> 1];
  195. }
  196. /* straw */
  197. static int bucket_straw_choose(struct crush_bucket_straw *bucket,
  198. int x, int r)
  199. {
  200. __u32 i;
  201. int high = 0;
  202. __u64 high_draw = 0;
  203. __u64 draw;
  204. for (i = 0; i < bucket->h.size; i++) {
  205. draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
  206. draw &= 0xffff;
  207. draw *= bucket->straws[i];
  208. if (i == 0 || draw > high_draw) {
  209. high = i;
  210. high_draw = draw;
  211. }
  212. }
  213. return bucket->h.items[high];
  214. }
  215. /* compute 2^44*log2(input+1) */
  216. static __u64 crush_ln(unsigned int xin)
  217. {
  218. unsigned int x = xin;
  219. int iexpon, index1, index2;
  220. __u64 RH, LH, LL, xl64, result;
  221. x++;
  222. /* normalize input */
  223. iexpon = 15;
  224. /*
  225. * figure out number of bits we need to shift and
  226. * do it in one step instead of iteratively
  227. */
  228. if (!(x & 0x18000)) {
  229. int bits = __builtin_clz(x & 0x1FFFF) - 16;
  230. x <<= bits;
  231. iexpon = 15 - bits;
  232. }
  233. index1 = (x >> 8) << 1;
  234. /* RH ~ 2^56/index1 */
  235. RH = __RH_LH_tbl[index1 - 256];
  236. /* LH ~ 2^48 * log2(index1/256) */
  237. LH = __RH_LH_tbl[index1 + 1 - 256];
  238. /* RH*x ~ 2^48 * (2^15 + xf), xf<2^8 */
  239. xl64 = (__s64)x * RH;
  240. xl64 >>= 48;
  241. result = iexpon;
  242. result <<= (12 + 32);
  243. index2 = xl64 & 0xff;
  244. /* LL ~ 2^48*log2(1.0+index2/2^15) */
  245. LL = __LL_tbl[index2];
  246. LH = LH + LL;
  247. LH >>= (48 - 12 - 32);
  248. result += LH;
  249. return result;
  250. }
  251. /*
  252. * straw2
  253. *
  254. * for reference, see:
  255. *
  256. * http://en.wikipedia.org/wiki/Exponential_distribution#Distribution_of_the_minimum_of_exponential_random_variables
  257. *
  258. */
  259. static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket,
  260. int x, int r)
  261. {
  262. unsigned int i, high = 0;
  263. unsigned int u;
  264. unsigned int w;
  265. __s64 ln, draw, high_draw = 0;
  266. for (i = 0; i < bucket->h.size; i++) {
  267. w = bucket->item_weights[i];
  268. if (w) {
  269. u = crush_hash32_3(bucket->h.hash, x,
  270. bucket->h.items[i], r);
  271. u &= 0xffff;
  272. /*
  273. * for some reason slightly less than 0x10000 produces
  274. * a slightly more accurate distribution... probably a
  275. * rounding effect.
  276. *
  277. * the natural log lookup table maps [0,0xffff]
  278. * (corresponding to real numbers [1/0x10000, 1] to
  279. * [0, 0xffffffffffff] (corresponding to real numbers
  280. * [-11.090355,0]).
  281. */
  282. ln = crush_ln(u) - 0x1000000000000ll;
  283. /*
  284. * divide by 16.16 fixed-point weight. note
  285. * that the ln value is negative, so a larger
  286. * weight means a larger (less negative) value
  287. * for draw.
  288. */
  289. draw = div64_s64(ln, w);
  290. } else {
  291. draw = S64_MIN;
  292. }
  293. if (i == 0 || draw > high_draw) {
  294. high = i;
  295. high_draw = draw;
  296. }
  297. }
  298. return bucket->h.items[high];
  299. }
  300. static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
  301. {
  302. dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
  303. BUG_ON(in->size == 0);
  304. switch (in->alg) {
  305. case CRUSH_BUCKET_UNIFORM:
  306. return bucket_uniform_choose((struct crush_bucket_uniform *)in,
  307. x, r);
  308. case CRUSH_BUCKET_LIST:
  309. return bucket_list_choose((struct crush_bucket_list *)in,
  310. x, r);
  311. case CRUSH_BUCKET_TREE:
  312. return bucket_tree_choose((struct crush_bucket_tree *)in,
  313. x, r);
  314. case CRUSH_BUCKET_STRAW:
  315. return bucket_straw_choose((struct crush_bucket_straw *)in,
  316. x, r);
  317. case CRUSH_BUCKET_STRAW2:
  318. return bucket_straw2_choose((struct crush_bucket_straw2 *)in,
  319. x, r);
  320. default:
  321. dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
  322. return in->items[0];
  323. }
  324. }
  325. /*
  326. * true if device is marked "out" (failed, fully offloaded)
  327. * of the cluster
  328. */
  329. static int is_out(const struct crush_map *map,
  330. const __u32 *weight, int weight_max,
  331. int item, int x)
  332. {
  333. if (item >= weight_max)
  334. return 1;
  335. if (weight[item] >= 0x10000)
  336. return 0;
  337. if (weight[item] == 0)
  338. return 1;
  339. if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
  340. < weight[item])
  341. return 0;
  342. return 1;
  343. }
  344. /**
  345. * crush_choose_firstn - choose numrep distinct items of given type
  346. * @map: the crush_map
  347. * @bucket: the bucket we are choose an item from
  348. * @x: crush input value
  349. * @numrep: the number of items to choose
  350. * @type: the type of item to choose
  351. * @out: pointer to output vector
  352. * @outpos: our position in that vector
  353. * @out_size: size of the out vector
  354. * @tries: number of attempts to make
  355. * @recurse_tries: number of attempts to have recursive chooseleaf make
  356. * @local_retries: localized retries
  357. * @local_fallback_retries: localized fallback retries
  358. * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
  359. * @stable: stable mode starts rep=0 in the recursive call for all replicas
  360. * @vary_r: pass r to recursive calls
  361. * @out2: second output vector for leaf items (if @recurse_to_leaf)
  362. * @parent_r: r value passed from the parent
  363. */
  364. static int crush_choose_firstn(const struct crush_map *map,
  365. struct crush_bucket *bucket,
  366. const __u32 *weight, int weight_max,
  367. int x, int numrep, int type,
  368. int *out, int outpos,
  369. int out_size,
  370. unsigned int tries,
  371. unsigned int recurse_tries,
  372. unsigned int local_retries,
  373. unsigned int local_fallback_retries,
  374. int recurse_to_leaf,
  375. unsigned int vary_r,
  376. unsigned int stable,
  377. int *out2,
  378. int parent_r)
  379. {
  380. int rep;
  381. unsigned int ftotal, flocal;
  382. int retry_descent, retry_bucket, skip_rep;
  383. struct crush_bucket *in = bucket;
  384. int r;
  385. int i;
  386. int item = 0;
  387. int itemtype;
  388. int collide, reject;
  389. int count = out_size;
  390. dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d stable %d\n",
  391. recurse_to_leaf ? "_LEAF" : "",
  392. bucket->id, x, outpos, numrep,
  393. tries, recurse_tries, local_retries, local_fallback_retries,
  394. parent_r, stable);
  395. for (rep = stable ? 0 : outpos; rep < numrep && count > 0 ; rep++) {
  396. /* keep trying until we get a non-out, non-colliding item */
  397. ftotal = 0;
  398. skip_rep = 0;
  399. do {
  400. retry_descent = 0;
  401. in = bucket; /* initial bucket */
  402. /* choose through intervening buckets */
  403. flocal = 0;
  404. do {
  405. collide = 0;
  406. retry_bucket = 0;
  407. r = rep + parent_r;
  408. /* r' = r + f_total */
  409. r += ftotal;
  410. /* bucket choose */
  411. if (in->size == 0) {
  412. reject = 1;
  413. goto reject;
  414. }
  415. if (local_fallback_retries > 0 &&
  416. flocal >= (in->size>>1) &&
  417. flocal > local_fallback_retries)
  418. item = bucket_perm_choose(in, x, r);
  419. else
  420. item = crush_bucket_choose(in, x, r);
  421. if (item >= map->max_devices) {
  422. dprintk(" bad item %d\n", item);
  423. skip_rep = 1;
  424. break;
  425. }
  426. /* desired type? */
  427. if (item < 0)
  428. itemtype = map->buckets[-1-item]->type;
  429. else
  430. itemtype = 0;
  431. dprintk(" item %d type %d\n", item, itemtype);
  432. /* keep going? */
  433. if (itemtype != type) {
  434. if (item >= 0 ||
  435. (-1-item) >= map->max_buckets) {
  436. dprintk(" bad item type %d\n", type);
  437. skip_rep = 1;
  438. break;
  439. }
  440. in = map->buckets[-1-item];
  441. retry_bucket = 1;
  442. continue;
  443. }
  444. /* collision? */
  445. for (i = 0; i < outpos; i++) {
  446. if (out[i] == item) {
  447. collide = 1;
  448. break;
  449. }
  450. }
  451. reject = 0;
  452. if (!collide && recurse_to_leaf) {
  453. if (item < 0) {
  454. int sub_r;
  455. if (vary_r)
  456. sub_r = r >> (vary_r-1);
  457. else
  458. sub_r = 0;
  459. if (crush_choose_firstn(map,
  460. map->buckets[-1-item],
  461. weight, weight_max,
  462. x, stable ? 1 : outpos+1, 0,
  463. out2, outpos, count,
  464. recurse_tries, 0,
  465. local_retries,
  466. local_fallback_retries,
  467. 0,
  468. vary_r,
  469. stable,
  470. NULL,
  471. sub_r) <= outpos)
  472. /* didn't get leaf */
  473. reject = 1;
  474. } else {
  475. /* we already have a leaf! */
  476. out2[outpos] = item;
  477. }
  478. }
  479. if (!reject) {
  480. /* out? */
  481. if (itemtype == 0)
  482. reject = is_out(map, weight,
  483. weight_max,
  484. item, x);
  485. else
  486. reject = 0;
  487. }
  488. reject:
  489. if (reject || collide) {
  490. ftotal++;
  491. flocal++;
  492. if (collide && flocal <= local_retries)
  493. /* retry locally a few times */
  494. retry_bucket = 1;
  495. else if (local_fallback_retries > 0 &&
  496. flocal <= in->size + local_fallback_retries)
  497. /* exhaustive bucket search */
  498. retry_bucket = 1;
  499. else if (ftotal < tries)
  500. /* then retry descent */
  501. retry_descent = 1;
  502. else
  503. /* else give up */
  504. skip_rep = 1;
  505. dprintk(" reject %d collide %d "
  506. "ftotal %u flocal %u\n",
  507. reject, collide, ftotal,
  508. flocal);
  509. }
  510. } while (retry_bucket);
  511. } while (retry_descent);
  512. if (skip_rep) {
  513. dprintk("skip rep\n");
  514. continue;
  515. }
  516. dprintk("CHOOSE got %d\n", item);
  517. out[outpos] = item;
  518. outpos++;
  519. count--;
  520. #ifndef __KERNEL__
  521. if (map->choose_tries && ftotal <= map->choose_total_tries)
  522. map->choose_tries[ftotal]++;
  523. #endif
  524. }
  525. dprintk("CHOOSE returns %d\n", outpos);
  526. return outpos;
  527. }
  528. /**
  529. * crush_choose_indep: alternative breadth-first positionally stable mapping
  530. *
  531. */
  532. static void crush_choose_indep(const struct crush_map *map,
  533. struct crush_bucket *bucket,
  534. const __u32 *weight, int weight_max,
  535. int x, int left, int numrep, int type,
  536. int *out, int outpos,
  537. unsigned int tries,
  538. unsigned int recurse_tries,
  539. int recurse_to_leaf,
  540. int *out2,
  541. int parent_r)
  542. {
  543. struct crush_bucket *in = bucket;
  544. int endpos = outpos + left;
  545. int rep;
  546. unsigned int ftotal;
  547. int r;
  548. int i;
  549. int item = 0;
  550. int itemtype;
  551. int collide;
  552. dprintk("CHOOSE%s INDEP bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
  553. bucket->id, x, outpos, numrep);
  554. /* initially my result is undefined */
  555. for (rep = outpos; rep < endpos; rep++) {
  556. out[rep] = CRUSH_ITEM_UNDEF;
  557. if (out2)
  558. out2[rep] = CRUSH_ITEM_UNDEF;
  559. }
  560. for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) {
  561. #ifdef DEBUG_INDEP
  562. if (out2 && ftotal) {
  563. dprintk("%u %d a: ", ftotal, left);
  564. for (rep = outpos; rep < endpos; rep++) {
  565. dprintk(" %d", out[rep]);
  566. }
  567. dprintk("\n");
  568. dprintk("%u %d b: ", ftotal, left);
  569. for (rep = outpos; rep < endpos; rep++) {
  570. dprintk(" %d", out2[rep]);
  571. }
  572. dprintk("\n");
  573. }
  574. #endif
  575. for (rep = outpos; rep < endpos; rep++) {
  576. if (out[rep] != CRUSH_ITEM_UNDEF)
  577. continue;
  578. in = bucket; /* initial bucket */
  579. /* choose through intervening buckets */
  580. for (;;) {
  581. /* note: we base the choice on the position
  582. * even in the nested call. that means that
  583. * if the first layer chooses the same bucket
  584. * in a different position, we will tend to
  585. * choose a different item in that bucket.
  586. * this will involve more devices in data
  587. * movement and tend to distribute the load.
  588. */
  589. r = rep + parent_r;
  590. /* be careful */
  591. if (in->alg == CRUSH_BUCKET_UNIFORM &&
  592. in->size % numrep == 0)
  593. /* r'=r+(n+1)*f_total */
  594. r += (numrep+1) * ftotal;
  595. else
  596. /* r' = r + n*f_total */
  597. r += numrep * ftotal;
  598. /* bucket choose */
  599. if (in->size == 0) {
  600. dprintk(" empty bucket\n");
  601. break;
  602. }
  603. item = crush_bucket_choose(in, x, r);
  604. if (item >= map->max_devices) {
  605. dprintk(" bad item %d\n", item);
  606. out[rep] = CRUSH_ITEM_NONE;
  607. if (out2)
  608. out2[rep] = CRUSH_ITEM_NONE;
  609. left--;
  610. break;
  611. }
  612. /* desired type? */
  613. if (item < 0)
  614. itemtype = map->buckets[-1-item]->type;
  615. else
  616. itemtype = 0;
  617. dprintk(" item %d type %d\n", item, itemtype);
  618. /* keep going? */
  619. if (itemtype != type) {
  620. if (item >= 0 ||
  621. (-1-item) >= map->max_buckets) {
  622. dprintk(" bad item type %d\n", type);
  623. out[rep] = CRUSH_ITEM_NONE;
  624. if (out2)
  625. out2[rep] =
  626. CRUSH_ITEM_NONE;
  627. left--;
  628. break;
  629. }
  630. in = map->buckets[-1-item];
  631. continue;
  632. }
  633. /* collision? */
  634. collide = 0;
  635. for (i = outpos; i < endpos; i++) {
  636. if (out[i] == item) {
  637. collide = 1;
  638. break;
  639. }
  640. }
  641. if (collide)
  642. break;
  643. if (recurse_to_leaf) {
  644. if (item < 0) {
  645. crush_choose_indep(map,
  646. map->buckets[-1-item],
  647. weight, weight_max,
  648. x, 1, numrep, 0,
  649. out2, rep,
  650. recurse_tries, 0,
  651. 0, NULL, r);
  652. if (out2[rep] == CRUSH_ITEM_NONE) {
  653. /* placed nothing; no leaf */
  654. break;
  655. }
  656. } else {
  657. /* we already have a leaf! */
  658. out2[rep] = item;
  659. }
  660. }
  661. /* out? */
  662. if (itemtype == 0 &&
  663. is_out(map, weight, weight_max, item, x))
  664. break;
  665. /* yay! */
  666. out[rep] = item;
  667. left--;
  668. break;
  669. }
  670. }
  671. }
  672. for (rep = outpos; rep < endpos; rep++) {
  673. if (out[rep] == CRUSH_ITEM_UNDEF) {
  674. out[rep] = CRUSH_ITEM_NONE;
  675. }
  676. if (out2 && out2[rep] == CRUSH_ITEM_UNDEF) {
  677. out2[rep] = CRUSH_ITEM_NONE;
  678. }
  679. }
  680. #ifndef __KERNEL__
  681. if (map->choose_tries && ftotal <= map->choose_total_tries)
  682. map->choose_tries[ftotal]++;
  683. #endif
  684. #ifdef DEBUG_INDEP
  685. if (out2) {
  686. dprintk("%u %d a: ", ftotal, left);
  687. for (rep = outpos; rep < endpos; rep++) {
  688. dprintk(" %d", out[rep]);
  689. }
  690. dprintk("\n");
  691. dprintk("%u %d b: ", ftotal, left);
  692. for (rep = outpos; rep < endpos; rep++) {
  693. dprintk(" %d", out2[rep]);
  694. }
  695. dprintk("\n");
  696. }
  697. #endif
  698. }
  699. /**
  700. * crush_do_rule - calculate a mapping with the given input and rule
  701. * @map: the crush_map
  702. * @ruleno: the rule id
  703. * @x: hash input
  704. * @result: pointer to result vector
  705. * @result_max: maximum result size
  706. * @weight: weight vector (for map leaves)
  707. * @weight_max: size of weight vector
  708. * @scratch: scratch vector for private use; must be >= 3 * result_max
  709. */
  710. int crush_do_rule(const struct crush_map *map,
  711. int ruleno, int x, int *result, int result_max,
  712. const __u32 *weight, int weight_max,
  713. int *scratch)
  714. {
  715. int result_len;
  716. int *a = scratch;
  717. int *b = scratch + result_max;
  718. int *c = scratch + result_max*2;
  719. int recurse_to_leaf;
  720. int *w;
  721. int wsize = 0;
  722. int *o;
  723. int osize;
  724. int *tmp;
  725. struct crush_rule *rule;
  726. __u32 step;
  727. int i, j;
  728. int numrep;
  729. int out_size;
  730. /*
  731. * the original choose_total_tries value was off by one (it
  732. * counted "retries" and not "tries"). add one.
  733. */
  734. int choose_tries = map->choose_total_tries + 1;
  735. int choose_leaf_tries = 0;
  736. /*
  737. * the local tries values were counted as "retries", though,
  738. * and need no adjustment
  739. */
  740. int choose_local_retries = map->choose_local_tries;
  741. int choose_local_fallback_retries = map->choose_local_fallback_tries;
  742. int vary_r = map->chooseleaf_vary_r;
  743. int stable = map->chooseleaf_stable;
  744. if ((__u32)ruleno >= map->max_rules) {
  745. dprintk(" bad ruleno %d\n", ruleno);
  746. return 0;
  747. }
  748. rule = map->rules[ruleno];
  749. result_len = 0;
  750. w = a;
  751. o = b;
  752. for (step = 0; step < rule->len; step++) {
  753. int firstn = 0;
  754. struct crush_rule_step *curstep = &rule->steps[step];
  755. switch (curstep->op) {
  756. case CRUSH_RULE_TAKE:
  757. if ((curstep->arg1 >= 0 &&
  758. curstep->arg1 < map->max_devices) ||
  759. (-1-curstep->arg1 >= 0 &&
  760. -1-curstep->arg1 < map->max_buckets &&
  761. map->buckets[-1-curstep->arg1])) {
  762. w[0] = curstep->arg1;
  763. wsize = 1;
  764. } else {
  765. dprintk(" bad take value %d\n", curstep->arg1);
  766. }
  767. break;
  768. case CRUSH_RULE_SET_CHOOSE_TRIES:
  769. if (curstep->arg1 > 0)
  770. choose_tries = curstep->arg1;
  771. break;
  772. case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
  773. if (curstep->arg1 > 0)
  774. choose_leaf_tries = curstep->arg1;
  775. break;
  776. case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES:
  777. if (curstep->arg1 >= 0)
  778. choose_local_retries = curstep->arg1;
  779. break;
  780. case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES:
  781. if (curstep->arg1 >= 0)
  782. choose_local_fallback_retries = curstep->arg1;
  783. break;
  784. case CRUSH_RULE_SET_CHOOSELEAF_VARY_R:
  785. if (curstep->arg1 >= 0)
  786. vary_r = curstep->arg1;
  787. break;
  788. case CRUSH_RULE_SET_CHOOSELEAF_STABLE:
  789. if (curstep->arg1 >= 0)
  790. stable = curstep->arg1;
  791. break;
  792. case CRUSH_RULE_CHOOSELEAF_FIRSTN:
  793. case CRUSH_RULE_CHOOSE_FIRSTN:
  794. firstn = 1;
  795. /* fall through */
  796. case CRUSH_RULE_CHOOSELEAF_INDEP:
  797. case CRUSH_RULE_CHOOSE_INDEP:
  798. if (wsize == 0)
  799. break;
  800. recurse_to_leaf =
  801. curstep->op ==
  802. CRUSH_RULE_CHOOSELEAF_FIRSTN ||
  803. curstep->op ==
  804. CRUSH_RULE_CHOOSELEAF_INDEP;
  805. /* reset output */
  806. osize = 0;
  807. for (i = 0; i < wsize; i++) {
  808. int bno;
  809. /*
  810. * see CRUSH_N, CRUSH_N_MINUS macros.
  811. * basically, numrep <= 0 means relative to
  812. * the provided result_max
  813. */
  814. numrep = curstep->arg1;
  815. if (numrep <= 0) {
  816. numrep += result_max;
  817. if (numrep <= 0)
  818. continue;
  819. }
  820. j = 0;
  821. /* make sure bucket id is valid */
  822. bno = -1 - w[i];
  823. if (bno < 0 || bno >= map->max_buckets) {
  824. /* w[i] is probably CRUSH_ITEM_NONE */
  825. dprintk(" bad w[i] %d\n", w[i]);
  826. continue;
  827. }
  828. if (firstn) {
  829. int recurse_tries;
  830. if (choose_leaf_tries)
  831. recurse_tries =
  832. choose_leaf_tries;
  833. else if (map->chooseleaf_descend_once)
  834. recurse_tries = 1;
  835. else
  836. recurse_tries = choose_tries;
  837. osize += crush_choose_firstn(
  838. map,
  839. map->buckets[bno],
  840. weight, weight_max,
  841. x, numrep,
  842. curstep->arg2,
  843. o+osize, j,
  844. result_max-osize,
  845. choose_tries,
  846. recurse_tries,
  847. choose_local_retries,
  848. choose_local_fallback_retries,
  849. recurse_to_leaf,
  850. vary_r,
  851. stable,
  852. c+osize,
  853. 0);
  854. } else {
  855. out_size = ((numrep < (result_max-osize)) ?
  856. numrep : (result_max-osize));
  857. crush_choose_indep(
  858. map,
  859. map->buckets[bno],
  860. weight, weight_max,
  861. x, out_size, numrep,
  862. curstep->arg2,
  863. o+osize, j,
  864. choose_tries,
  865. choose_leaf_tries ?
  866. choose_leaf_tries : 1,
  867. recurse_to_leaf,
  868. c+osize,
  869. 0);
  870. osize += out_size;
  871. }
  872. }
  873. if (recurse_to_leaf)
  874. /* copy final _leaf_ values to output set */
  875. memcpy(o, c, osize*sizeof(*o));
  876. /* swap o and w arrays */
  877. tmp = o;
  878. o = w;
  879. w = tmp;
  880. wsize = osize;
  881. break;
  882. case CRUSH_RULE_EMIT:
  883. for (i = 0; i < wsize && result_len < result_max; i++) {
  884. result[result_len] = w[i];
  885. result_len++;
  886. }
  887. wsize = 0;
  888. break;
  889. default:
  890. dprintk(" unknown op %d at step %d\n",
  891. curstep->op, step);
  892. break;
  893. }
  894. }
  895. return result_len;
  896. }