raid56.c

  1. /*
  2. * Copyright (C) 2012 Fusion-io All rights reserved.
  3. * Copyright (C) 2012 Intel Corp. All rights reserved.
  4. *
  5. * This program is free software; you can redistribute it and/or
  6. * modify it under the terms of the GNU General Public
  7. * License v2 as published by the Free Software Foundation.
  8. *
  9. * This program is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  12. * General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU General Public
  15. * License along with this program; if not, write to the
  16. * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  17. * Boston, MA 02111-1307, USA.
  18. */
  19. #include <linux/sched.h>
  20. #include <linux/wait.h>
  21. #include <linux/bio.h>
  22. #include <linux/slab.h>
  23. #include <linux/buffer_head.h>
  24. #include <linux/blkdev.h>
  25. #include <linux/random.h>
  26. #include <linux/iocontext.h>
  27. #include <linux/capability.h>
  28. #include <linux/ratelimit.h>
  29. #include <linux/kthread.h>
  30. #include <linux/raid/pq.h>
  31. #include <linux/hash.h>
  32. #include <linux/list_sort.h>
  33. #include <linux/raid/xor.h>
  34. #include <linux/vmalloc.h>
  35. #include <asm/div64.h>
  36. #include "ctree.h"
  37. #include "extent_map.h"
  38. #include "disk-io.h"
  39. #include "transaction.h"
  40. #include "print-tree.h"
  41. #include "volumes.h"
  42. #include "raid56.h"
  43. #include "async-thread.h"
  44. #include "check-integrity.h"
  45. #include "rcu-string.h"
  46. /* set when additional merges to this rbio are not allowed */
  47. #define RBIO_RMW_LOCKED_BIT 1
  48. /*
  49. * set when this rbio is sitting in the hash, but it is just a cache
  50. * of past RMW
  51. */
  52. #define RBIO_CACHE_BIT 2
  53. /*
  54. * set when it is safe to trust the stripe_pages for caching
  55. */
  56. #define RBIO_CACHE_READY_BIT 3
  57. #define RBIO_CACHE_SIZE 1024
  58. struct btrfs_raid_bio {
  59. struct btrfs_fs_info *fs_info;
  60. struct btrfs_bio *bbio;
  61. /*
  62. * logical block numbers for the start of each stripe
  63. * The last one or two are p/q. These are sorted,
  64. * so raid_map[0] is the start of our full stripe
  65. */
  66. u64 *raid_map;
  67. /* while we're doing rmw on a stripe
  68. * we put it into a hash table so we can
  69. * lock the stripe and merge more rbios
  70. * into it.
  71. */
  72. struct list_head hash_list;
  73. /*
  74. * LRU list for the stripe cache
  75. */
  76. struct list_head stripe_cache;
  77. /*
  78. * for scheduling work in the helper threads
  79. */
  80. struct btrfs_work work;
  81. /*
  82. * bio list and bio_list_lock are used
  83. * to add more bios into the stripe
  84. * in hopes of avoiding the full rmw
  85. */
  86. struct bio_list bio_list;
  87. spinlock_t bio_list_lock;
  88. /* also protected by the bio_list_lock, the
  89. * plug list is used by the plugging code
  90. * to collect partial bios while plugged. The
  91. * stripe locking code also uses it to hand off
  92. * the stripe lock to the next pending IO
  93. */
  94. struct list_head plug_list;
  95. /*
  96. * flags that tell us if it is safe to
  97. * merge with this bio
  98. */
  99. unsigned long flags;
  100. /* size of each individual stripe on disk */
  101. int stripe_len;
  102. /* number of data stripes (no p/q) */
  103. int nr_data;
  104. /*
  105. * set if we're doing a parity rebuild
  106. * for a read from higher up, which is handled
  107. * differently from a parity rebuild as part of
  108. * rmw
  109. */
  110. int read_rebuild;
  111. /* first bad stripe */
  112. int faila;
  113. /* second bad stripe (for raid6 use) */
  114. int failb;
  115. /*
  116. * number of pages needed to represent the full
  117. * stripe
  118. */
  119. int nr_pages;
  120. /*
  121. * size of all the bios in the bio_list. This
  122. * helps us decide if the rbio maps to a full
  123. * stripe or not
  124. */
  125. int bio_list_bytes;
  126. atomic_t refs;
  127. /*
  128. * these are two arrays of pointers. We allocate the
  129. * rbio big enough to hold them both and setup their
  130. * locations when the rbio is allocated
  131. */
  132. /* pointers to pages that we allocated for
  133. * reading/writing stripes directly from the disk (including P/Q)
  134. */
  135. struct page **stripe_pages;
  136. /*
  137. * pointers to the pages in the bio_list. Stored
  138. * here for faster lookup
  139. */
  140. struct page **bio_pages;
  141. };
  142. static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
  143. static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
  144. static void rmw_work(struct btrfs_work *work);
  145. static void read_rebuild_work(struct btrfs_work *work);
  146. static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
  147. static void async_read_rebuild(struct btrfs_raid_bio *rbio);
  148. static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
  149. static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
  150. static void __free_raid_bio(struct btrfs_raid_bio *rbio);
  151. static void index_rbio_pages(struct btrfs_raid_bio *rbio);
  152. static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
  153. /*
  154. * the stripe hash table is used for locking, and to collect
  155. * bios in hopes of making a full stripe
  156. */
  157. int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
  158. {
  159. struct btrfs_stripe_hash_table *table;
  160. struct btrfs_stripe_hash_table *x;
  161. struct btrfs_stripe_hash *cur;
  162. struct btrfs_stripe_hash *h;
  163. int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
  164. int i;
  165. int table_size;
  166. if (info->stripe_hash_table)
  167. return 0;
  168. /*
  169. * The table is large, starting with order 4 and can go as high as
  170. * order 7 in case lock debugging is turned on.
  171. *
  172. * Try harder to allocate and fall back to vmalloc to lower the chance
  173. * of a failing mount.
  174. */
  175. table_size = sizeof(*table) + sizeof(*h) * num_entries;
  176. table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
  177. if (!table) {
  178. table = vzalloc(table_size);
  179. if (!table)
  180. return -ENOMEM;
  181. }
  182. spin_lock_init(&table->cache_lock);
  183. INIT_LIST_HEAD(&table->stripe_cache);
  184. h = table->table;
  185. for (i = 0; i < num_entries; i++) {
  186. cur = h + i;
  187. INIT_LIST_HEAD(&cur->hash_list);
  188. spin_lock_init(&cur->lock);
  189. init_waitqueue_head(&cur->wait);
  190. }
  191. x = cmpxchg(&info->stripe_hash_table, NULL, table);
  192. if (x) {
  193. if (is_vmalloc_addr(x))
  194. vfree(x);
  195. else
  196. kfree(x);
  197. }
  198. return 0;
  199. }
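/*
 * Illustrative note (not part of the original source): the cmpxchg above
 * handles two mounts racing to allocate the table.  If thread A installs
 * its table first, thread B's cmpxchg returns A's pointer in 'x', so B
 * simply frees its own copy (vfree or kfree depending on how it was
 * allocated) and both callers end up sharing the same table.
 */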
  200. /*
  201. * caching an rbio means to copy anything from the
  202. * bio_pages array into the stripe_pages array. We
  203. * use the page uptodate bit in the stripe cache array
  204. * to indicate if it has valid data
  205. *
  206. * once the caching is done, we set the cache ready
  207. * bit.
  208. */
  209. static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
  210. {
  211. int i;
  212. char *s;
  213. char *d;
  214. int ret;
  215. ret = alloc_rbio_pages(rbio);
  216. if (ret)
  217. return;
  218. for (i = 0; i < rbio->nr_pages; i++) {
  219. if (!rbio->bio_pages[i])
  220. continue;
  221. s = kmap(rbio->bio_pages[i]);
  222. d = kmap(rbio->stripe_pages[i]);
  223. memcpy(d, s, PAGE_CACHE_SIZE);
  224. kunmap(rbio->bio_pages[i]);
  225. kunmap(rbio->stripe_pages[i]);
  226. SetPageUptodate(rbio->stripe_pages[i]);
  227. }
  228. set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
  229. }
  230. /*
  231. * we hash on the first logical address of the stripe
  232. */
  233. static int rbio_bucket(struct btrfs_raid_bio *rbio)
  234. {
  235. u64 num = rbio->raid_map[0];
  236. /*
  237. * we shift down quite a bit. We're using byte
  238. * addressing, and most of the lower bits are zeros.
  239. * This tends to upset hash_64, and it consistently
  240. * returns just one or two different values.
  241. *
  242. * shifting off the lower bits fixes things.
  243. */
  244. return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
  245. }
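/*
 * Worked example (hypothetical numbers, for illustration only): full
 * stripes start at large, aligned byte addresses, so the low bits carry
 * no information.  Two neighbouring full stripes at 64MiB and
 * 64MiB + 64KiB become 1024 and 1025 after the >> 16, and hash_64 then
 * spreads those distinct values across the
 * 1 << BTRFS_STRIPE_HASH_TABLE_BITS buckets.
 */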
  246. /*
  247. * stealing an rbio means taking all the uptodate pages from the stripe
  248. * array in the source rbio and putting them into the destination rbio
  249. */
  250. static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
  251. {
  252. int i;
  253. struct page *s;
  254. struct page *d;
  255. if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
  256. return;
  257. for (i = 0; i < dest->nr_pages; i++) {
  258. s = src->stripe_pages[i];
  259. if (!s || !PageUptodate(s)) {
  260. continue;
  261. }
  262. d = dest->stripe_pages[i];
  263. if (d)
  264. __free_page(d);
  265. dest->stripe_pages[i] = s;
  266. src->stripe_pages[i] = NULL;
  267. }
  268. }
  269. /*
  270. * merging means we take the bio_list from the victim and
  271. * splice it into the destination. The victim should
  272. * be discarded afterwards.
  273. *
  274. * must be called with dest->bio_list_lock held
  275. */
  276. static void merge_rbio(struct btrfs_raid_bio *dest,
  277. struct btrfs_raid_bio *victim)
  278. {
  279. bio_list_merge(&dest->bio_list, &victim->bio_list);
  280. dest->bio_list_bytes += victim->bio_list_bytes;
  281. bio_list_init(&victim->bio_list);
  282. }
  283. /*
  284. * used to prune items that are in the cache. The caller
  285. * must hold the hash table lock.
  286. */
  287. static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
  288. {
  289. int bucket = rbio_bucket(rbio);
  290. struct btrfs_stripe_hash_table *table;
  291. struct btrfs_stripe_hash *h;
  292. int freeit = 0;
  293. /*
  294. * check the bit again under the hash table lock.
  295. */
  296. if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
  297. return;
  298. table = rbio->fs_info->stripe_hash_table;
  299. h = table->table + bucket;
  300. /* hold the lock for the bucket because we may be
  301. * removing it from the hash table
  302. */
  303. spin_lock(&h->lock);
  304. /*
  305. * hold the lock for the bio list because we need
  306. * to make sure the bio list is empty
  307. */
  308. spin_lock(&rbio->bio_list_lock);
  309. if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
  310. list_del_init(&rbio->stripe_cache);
  311. table->cache_size -= 1;
  312. freeit = 1;
  313. /* if the bio list isn't empty, this rbio is
  314. * still involved in an IO. We take it out
  315. * of the cache list, and drop the ref that
  316. * was held for the list.
  317. *
  318. * If the bio_list was empty, we also remove
  319. * the rbio from the hash_table, and drop
  320. * the corresponding ref
  321. */
  322. if (bio_list_empty(&rbio->bio_list)) {
  323. if (!list_empty(&rbio->hash_list)) {
  324. list_del_init(&rbio->hash_list);
  325. atomic_dec(&rbio->refs);
  326. BUG_ON(!list_empty(&rbio->plug_list));
  327. }
  328. }
  329. }
  330. spin_unlock(&rbio->bio_list_lock);
  331. spin_unlock(&h->lock);
  332. if (freeit)
  333. __free_raid_bio(rbio);
  334. }
  335. /*
  336. * prune a given rbio from the cache
  337. */
  338. static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
  339. {
  340. struct btrfs_stripe_hash_table *table;
  341. unsigned long flags;
  342. if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
  343. return;
  344. table = rbio->fs_info->stripe_hash_table;
  345. spin_lock_irqsave(&table->cache_lock, flags);
  346. __remove_rbio_from_cache(rbio);
  347. spin_unlock_irqrestore(&table->cache_lock, flags);
  348. }
  349. /*
  350. * remove everything in the cache
  351. */
  352. static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
  353. {
  354. struct btrfs_stripe_hash_table *table;
  355. unsigned long flags;
  356. struct btrfs_raid_bio *rbio;
  357. table = info->stripe_hash_table;
  358. spin_lock_irqsave(&table->cache_lock, flags);
  359. while (!list_empty(&table->stripe_cache)) {
  360. rbio = list_entry(table->stripe_cache.next,
  361. struct btrfs_raid_bio,
  362. stripe_cache);
  363. __remove_rbio_from_cache(rbio);
  364. }
  365. spin_unlock_irqrestore(&table->cache_lock, flags);
  366. }
  367. /*
  368. * remove all cached entries and free the hash table
  369. * used by unmount
  370. */
  371. void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
  372. {
  373. if (!info->stripe_hash_table)
  374. return;
  375. btrfs_clear_rbio_cache(info);
  376. if (is_vmalloc_addr(info->stripe_hash_table))
  377. vfree(info->stripe_hash_table);
  378. else
  379. kfree(info->stripe_hash_table);
  380. info->stripe_hash_table = NULL;
  381. }
  382. /*
  383. * insert an rbio into the stripe cache. It
  384. * must have already been prepared by calling
  385. * cache_rbio_pages
  386. *
  387. * If this rbio was already cached, it gets
  388. * moved to the front of the lru.
  389. *
  390. * If the size of the rbio cache is too big, we
  391. * prune an item.
  392. */
  393. static void cache_rbio(struct btrfs_raid_bio *rbio)
  394. {
  395. struct btrfs_stripe_hash_table *table;
  396. unsigned long flags;
  397. if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
  398. return;
  399. table = rbio->fs_info->stripe_hash_table;
  400. spin_lock_irqsave(&table->cache_lock, flags);
  401. spin_lock(&rbio->bio_list_lock);
  402. /* bump our ref if we were not in the list before */
  403. if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
  404. atomic_inc(&rbio->refs);
  405. if (!list_empty(&rbio->stripe_cache)) {
  406. list_move(&rbio->stripe_cache, &table->stripe_cache);
  407. } else {
  408. list_add(&rbio->stripe_cache, &table->stripe_cache);
  409. table->cache_size += 1;
  410. }
  411. spin_unlock(&rbio->bio_list_lock);
  412. if (table->cache_size > RBIO_CACHE_SIZE) {
  413. struct btrfs_raid_bio *found;
  414. found = list_entry(table->stripe_cache.prev,
  415. struct btrfs_raid_bio,
  416. stripe_cache);
  417. if (found != rbio)
  418. __remove_rbio_from_cache(found);
  419. }
  420. spin_unlock_irqrestore(&table->cache_lock, flags);
  421. return;
  422. }
  423. /*
  424. * helper function to run the xor_blocks api. It is only
  425. * able to do MAX_XOR_BLOCKS at a time, so we need to
  426. * loop through.
  427. */
  428. static void run_xor(void **pages, int src_cnt, ssize_t len)
  429. {
  430. int src_off = 0;
  431. int xor_src_cnt = 0;
  432. void *dest = pages[src_cnt];
  433. while (src_cnt > 0) {
  434. xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
  435. xor_blocks(xor_src_cnt, len, dest, pages + src_off);
  436. src_cnt -= xor_src_cnt;
  437. src_off += xor_src_cnt;
  438. }
  439. }
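/*
 * Worked example (illustrative): xor_blocks is limited to MAX_XOR_BLOCKS
 * sources per call (4 in mainline kernels of this era).  With src_cnt = 6
 * the loop runs twice: xor_blocks(4, len, dest, pages + 0) and then
 * xor_blocks(2, len, dest, pages + 4), both accumulating into
 * dest = pages[6], which was captured before src_cnt was decremented.
 */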
  440. /*
  441. * returns true if the bio list inside this rbio
  442. * covers an entire stripe (no rmw required).
  443. * Must be called with the bio list lock held, or
  444. * at a time when you know it is impossible to add
  445. * new bios into the list
  446. */
  447. static int __rbio_is_full(struct btrfs_raid_bio *rbio)
  448. {
  449. unsigned long size = rbio->bio_list_bytes;
  450. int ret = 1;
  451. if (size != rbio->nr_data * rbio->stripe_len)
  452. ret = 0;
  453. BUG_ON(size > rbio->nr_data * rbio->stripe_len);
  454. return ret;
  455. }
  456. static int rbio_is_full(struct btrfs_raid_bio *rbio)
  457. {
  458. unsigned long flags;
  459. int ret;
  460. spin_lock_irqsave(&rbio->bio_list_lock, flags);
  461. ret = __rbio_is_full(rbio);
  462. spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
  463. return ret;
  464. }
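/*
 * Example (hypothetical geometry): with nr_data = 2 and a 64KiB
 * stripe_len, the rbio is "full" only when the bio_list carries exactly
 * 128KiB, i.e. every data byte of the full stripe was sent down by the
 * upper layers and no read phase of the read-modify-write is needed.
 * The BUG_ON catches the impossible case of more bytes than the data
 * portion of the stripe can hold.
 */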
  465. /*
  466. * returns 1 if it is safe to merge two rbios together.
  467. * The merging is safe if the two rbios correspond to
  468. * the same stripe and if they are both going in the same
  469. * direction (read vs write), and if neither one is
  470. * locked for final IO
  471. *
  472. * The caller is responsible for locking such that
  473. * rmw_locked is safe to test
  474. */
  475. static int rbio_can_merge(struct btrfs_raid_bio *last,
  476. struct btrfs_raid_bio *cur)
  477. {
  478. if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
  479. test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
  480. return 0;
  481. /*
  482. * we can't merge with cached rbios, since the
  483. * idea is that when we merge the destination
  484. * rbio is going to run our IO for us. We can
  485. * steal from cached rbio's though, other functions
  486. * handle that.
  487. */
  488. if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
  489. test_bit(RBIO_CACHE_BIT, &cur->flags))
  490. return 0;
  491. if (last->raid_map[0] !=
  492. cur->raid_map[0])
  493. return 0;
  494. /* reads can't merge with writes */
  495. if (last->read_rebuild !=
  496. cur->read_rebuild) {
  497. return 0;
  498. }
  499. return 1;
  500. }
  501. /*
  502. * helper to index into the pstripe
  503. */
  504. static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
  505. {
  506. index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
  507. return rbio->stripe_pages[index];
  508. }
  509. /*
  510. * helper to index into the qstripe, returns null
  511. * if there is no qstripe
  512. */
  513. static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
  514. {
  515. if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
  516. return NULL;
  517. index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
  518. PAGE_CACHE_SHIFT;
  519. return rbio->stripe_pages[index];
  520. }
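/*
 * Index math, worked through on hypothetical geometry (64KiB stripes,
 * 4KiB pages, nr_data = 2): stripe_pages[] holds the stripes back to
 * back, so pages 0-15 are data stripe 0, 16-31 are data stripe 1,
 * 32-47 are P and 48-63 are Q (RAID6 only).  rbio_pstripe_page(rbio, 3)
 * therefore returns stripe_pages[2 * 16 + 3] = stripe_pages[35], and
 * rbio_qstripe_page(rbio, 3) returns stripe_pages[51], or NULL on RAID5
 * where num_stripes == nr_data + 1.
 */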
  521. /*
  522. * The first stripe in the table for a logical address
  523. * has the lock. rbios are added in one of three ways:
  524. *
  525. * 1) Nobody has the stripe locked yet. The rbio is given
  526. * the lock and 0 is returned. The caller must start the IO
  527. * themselves.
  528. *
  529. * 2) Someone has the stripe locked, but we're able to merge
  530. * with the lock owner. The rbio is freed and the IO will
  531. * start automatically along with the existing rbio. 1 is returned.
  532. *
  533. * 3) Someone has the stripe locked, but we're not able to merge.
  534. * The rbio is added to the lock owner's plug list, or merged into
  535. * an rbio already on the plug list. When the lock owner unlocks,
  536. * the next rbio on the list is run and the IO is started automatically.
  537. * 1 is returned
  538. *
  539. * If we return 0, the caller still owns the rbio and must continue with
  540. * IO submission. If we return 1, the caller must assume the rbio has
  541. * already been freed.
  542. */
  543. static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
  544. {
  545. int bucket = rbio_bucket(rbio);
  546. struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
  547. struct btrfs_raid_bio *cur;
  548. struct btrfs_raid_bio *pending;
  549. unsigned long flags;
  550. DEFINE_WAIT(wait);
  551. struct btrfs_raid_bio *freeit = NULL;
  552. struct btrfs_raid_bio *cache_drop = NULL;
  553. int ret = 0;
  554. int walk = 0;
  555. spin_lock_irqsave(&h->lock, flags);
  556. list_for_each_entry(cur, &h->hash_list, hash_list) {
  557. walk++;
  558. if (cur->raid_map[0] == rbio->raid_map[0]) {
  559. spin_lock(&cur->bio_list_lock);
  560. /* can we steal this cached rbio's pages? */
  561. if (bio_list_empty(&cur->bio_list) &&
  562. list_empty(&cur->plug_list) &&
  563. test_bit(RBIO_CACHE_BIT, &cur->flags) &&
  564. !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
  565. list_del_init(&cur->hash_list);
  566. atomic_dec(&cur->refs);
  567. steal_rbio(cur, rbio);
  568. cache_drop = cur;
  569. spin_unlock(&cur->bio_list_lock);
  570. goto lockit;
  571. }
  572. /* can we merge into the lock owner? */
  573. if (rbio_can_merge(cur, rbio)) {
  574. merge_rbio(cur, rbio);
  575. spin_unlock(&cur->bio_list_lock);
  576. freeit = rbio;
  577. ret = 1;
  578. goto out;
  579. }
  580. /*
  581. * we couldn't merge with the running
  582. * rbio, see if we can merge with the
  583. * pending ones. We don't have to
  584. * check for rmw_locked because there
  585. * is no way they are inside finish_rmw
  586. * right now
  587. */
  588. list_for_each_entry(pending, &cur->plug_list,
  589. plug_list) {
  590. if (rbio_can_merge(pending, rbio)) {
  591. merge_rbio(pending, rbio);
  592. spin_unlock(&cur->bio_list_lock);
  593. freeit = rbio;
  594. ret = 1;
  595. goto out;
  596. }
  597. }
  598. /* no merging, put us on the tail of the plug list,
  599. * our rbio will be started when the currently
  600. * running rbio unlocks
  601. */
  602. list_add_tail(&rbio->plug_list, &cur->plug_list);
  603. spin_unlock(&cur->bio_list_lock);
  604. ret = 1;
  605. goto out;
  606. }
  607. }
  608. lockit:
  609. atomic_inc(&rbio->refs);
  610. list_add(&rbio->hash_list, &h->hash_list);
  611. out:
  612. spin_unlock_irqrestore(&h->lock, flags);
  613. if (cache_drop)
  614. remove_rbio_from_cache(cache_drop);
  615. if (freeit)
  616. __free_raid_bio(freeit);
  617. return ret;
  618. }
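/*
 * Caller contract, sketched for illustration (full_stripe_write() and
 * partial_stripe_write() below are the real users):
 *
 *	if (lock_stripe_add(rbio) == 0)
 *		finish_rmw(rbio);
 *
 * A return of 0 means we own the stripe lock and must start the IO
 * ourselves.  A return of 1 means the rbio was merged or parked on a
 * plug list and may already have been freed, so it must not be touched
 * again.
 */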
  619. /*
  620. * called as rmw or parity rebuild is completed. If the plug list has more
  621. * rbios waiting for this stripe, the next one on the list will be started
  622. */
  623. static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
  624. {
  625. int bucket;
  626. struct btrfs_stripe_hash *h;
  627. unsigned long flags;
  628. int keep_cache = 0;
  629. bucket = rbio_bucket(rbio);
  630. h = rbio->fs_info->stripe_hash_table->table + bucket;
  631. if (list_empty(&rbio->plug_list))
  632. cache_rbio(rbio);
  633. spin_lock_irqsave(&h->lock, flags);
  634. spin_lock(&rbio->bio_list_lock);
  635. if (!list_empty(&rbio->hash_list)) {
  636. /*
  637. * if we're still cached and there is no other IO
  638. * to perform, just leave this rbio here for others
  639. * to steal from later
  640. */
  641. if (list_empty(&rbio->plug_list) &&
  642. test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
  643. keep_cache = 1;
  644. clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
  645. BUG_ON(!bio_list_empty(&rbio->bio_list));
  646. goto done;
  647. }
  648. list_del_init(&rbio->hash_list);
  649. atomic_dec(&rbio->refs);
  650. /*
  651. * we use the plug list to hold all the rbios
  652. * waiting for the chance to lock this stripe.
  653. * hand the lock over to one of them.
  654. */
  655. if (!list_empty(&rbio->plug_list)) {
  656. struct btrfs_raid_bio *next;
  657. struct list_head *head = rbio->plug_list.next;
  658. next = list_entry(head, struct btrfs_raid_bio,
  659. plug_list);
  660. list_del_init(&rbio->plug_list);
  661. list_add(&next->hash_list, &h->hash_list);
  662. atomic_inc(&next->refs);
  663. spin_unlock(&rbio->bio_list_lock);
  664. spin_unlock_irqrestore(&h->lock, flags);
  665. if (next->read_rebuild)
  666. async_read_rebuild(next);
  667. else {
  668. steal_rbio(rbio, next);
  669. async_rmw_stripe(next);
  670. }
  671. goto done_nolock;
  672. } else if (waitqueue_active(&h->wait)) {
  673. spin_unlock(&rbio->bio_list_lock);
  674. spin_unlock_irqrestore(&h->lock, flags);
  675. wake_up(&h->wait);
  676. goto done_nolock;
  677. }
  678. }
  679. done:
  680. spin_unlock(&rbio->bio_list_lock);
  681. spin_unlock_irqrestore(&h->lock, flags);
  682. done_nolock:
  683. if (!keep_cache)
  684. remove_rbio_from_cache(rbio);
  685. }
  686. static void __free_raid_bio(struct btrfs_raid_bio *rbio)
  687. {
  688. int i;
  689. WARN_ON(atomic_read(&rbio->refs) < 0);
  690. if (!atomic_dec_and_test(&rbio->refs))
  691. return;
  692. WARN_ON(!list_empty(&rbio->stripe_cache));
  693. WARN_ON(!list_empty(&rbio->hash_list));
  694. WARN_ON(!bio_list_empty(&rbio->bio_list));
  695. for (i = 0; i < rbio->nr_pages; i++) {
  696. if (rbio->stripe_pages[i]) {
  697. __free_page(rbio->stripe_pages[i]);
  698. rbio->stripe_pages[i] = NULL;
  699. }
  700. }
  701. kfree(rbio->raid_map);
  702. kfree(rbio->bbio);
  703. kfree(rbio);
  704. }
  705. static void free_raid_bio(struct btrfs_raid_bio *rbio)
  706. {
  707. unlock_stripe(rbio);
  708. __free_raid_bio(rbio);
  709. }
  710. /*
  711. * this frees the rbio and runs through all the bios in the
  712. * bio_list and calls end_io on them
  713. */
  714. static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
  715. {
  716. struct bio *cur = bio_list_get(&rbio->bio_list);
  717. struct bio *next;
  718. free_raid_bio(rbio);
  719. while (cur) {
  720. next = cur->bi_next;
  721. cur->bi_next = NULL;
  722. if (uptodate)
  723. set_bit(BIO_UPTODATE, &cur->bi_flags);
  724. bio_endio(cur, err);
  725. cur = next;
  726. }
  727. }
  728. /*
  729. * end io function used by finish_rmw. When we finally
  730. * get here, we've written a full stripe
  731. */
  732. static void raid_write_end_io(struct bio *bio, int err)
  733. {
  734. struct btrfs_raid_bio *rbio = bio->bi_private;
  735. if (err)
  736. fail_bio_stripe(rbio, bio);
  737. bio_put(bio);
  738. if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
  739. return;
  740. err = 0;
  741. /* OK, we have written all the stripes we need to. */
  742. if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
  743. err = -EIO;
  744. rbio_orig_end_io(rbio, err, 0);
  745. return;
  746. }
  747. /*
  748. * the read/modify/write code wants to use the original bio for
  749. * any pages it included, and then use the rbio for everything
  750. * else. This function decides if a given index (stripe number)
  751. * and page number in that stripe fall inside the original bio
  752. * or the rbio.
  753. *
  754. * if you set bio_list_only, you'll get a NULL back for any ranges
  755. * that are outside the bio_list
  756. *
  757. * This doesn't take any refs on anything, you get a bare page pointer
  758. * and the caller must bump refs as required.
  759. *
  760. * You must call index_rbio_pages once before you can trust
  761. * the answers from this function.
  762. */
  763. static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
  764. int index, int pagenr, int bio_list_only)
  765. {
  766. int chunk_page;
  767. struct page *p = NULL;
  768. chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
  769. spin_lock_irq(&rbio->bio_list_lock);
  770. p = rbio->bio_pages[chunk_page];
  771. spin_unlock_irq(&rbio->bio_list_lock);
  772. if (p || bio_list_only)
  773. return p;
  774. return rbio->stripe_pages[chunk_page];
  775. }
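/*
 * Example (hypothetical 64KiB stripes and 4KiB pages): index = 1,
 * pagenr = 3 gives chunk_page = 1 * 16 + 3 = 19.  If the upper-layer
 * bios covered that page, bio_pages[19] is returned; otherwise we fall
 * back to the rbio's own stripe_pages[19].  With bio_list_only set the
 * fallback is skipped and NULL comes back, which is how finish_rmw
 * writes only the pages the filesystem actually sent plus the parity.
 */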
  776. /*
  777. * number of pages we need for the entire stripe across all the
  778. * drives
  779. */
  780. static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
  781. {
  782. unsigned long nr = stripe_len * nr_stripes;
  783. return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
  784. }
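/*
 * Worked example (hypothetical): stripe_len = 64KiB and nr_stripes = 3
 * (two data stripes plus P) is 192KiB in total, i.e. 48 pages with a
 * 4KiB PAGE_CACHE_SIZE.  The round-up only matters for stripe lengths
 * that are not page aligned.
 */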
  785. /*
  786. * allocation and initial setup for the btrfs_raid_bio. Note that
  787. * this does not allocate any pages for rbio->stripe_pages.
  788. */
  789. static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
  790. struct btrfs_bio *bbio, u64 *raid_map,
  791. u64 stripe_len)
  792. {
  793. struct btrfs_raid_bio *rbio;
  794. int nr_data = 0;
  795. int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
  796. void *p;
  797. rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
  798. GFP_NOFS);
  799. if (!rbio) {
  800. kfree(raid_map);
  801. kfree(bbio);
  802. return ERR_PTR(-ENOMEM);
  803. }
  804. bio_list_init(&rbio->bio_list);
  805. INIT_LIST_HEAD(&rbio->plug_list);
  806. spin_lock_init(&rbio->bio_list_lock);
  807. INIT_LIST_HEAD(&rbio->stripe_cache);
  808. INIT_LIST_HEAD(&rbio->hash_list);
  809. rbio->bbio = bbio;
  810. rbio->raid_map = raid_map;
  811. rbio->fs_info = root->fs_info;
  812. rbio->stripe_len = stripe_len;
  813. rbio->nr_pages = num_pages;
  814. rbio->faila = -1;
  815. rbio->failb = -1;
  816. atomic_set(&rbio->refs, 1);
  817. /*
  818. * the stripe_pages and bio_pages array point to the extra
  819. * memory we allocated past the end of the rbio
  820. */
  821. p = rbio + 1;
  822. rbio->stripe_pages = p;
  823. rbio->bio_pages = p + sizeof(struct page *) * num_pages;
  824. if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
  825. nr_data = bbio->num_stripes - 2;
  826. else
  827. nr_data = bbio->num_stripes - 1;
  828. rbio->nr_data = nr_data;
  829. return rbio;
  830. }
  831. /* allocate pages for all the stripes in the bio, including parity */
  832. static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
  833. {
  834. int i;
  835. struct page *page;
  836. for (i = 0; i < rbio->nr_pages; i++) {
  837. if (rbio->stripe_pages[i])
  838. continue;
  839. page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
  840. if (!page)
  841. return -ENOMEM;
  842. rbio->stripe_pages[i] = page;
  843. ClearPageUptodate(page);
  844. }
  845. return 0;
  846. }
  847. /* allocate pages for just the p/q stripes */
  848. static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
  849. {
  850. int i;
  851. struct page *page;
  852. i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
  853. for (; i < rbio->nr_pages; i++) {
  854. if (rbio->stripe_pages[i])
  855. continue;
  856. page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
  857. if (!page)
  858. return -ENOMEM;
  859. rbio->stripe_pages[i] = page;
  860. }
  861. return 0;
  862. }
  863. /*
  864. * add a single page from a specific stripe into our list of bios for IO.
  865. * This will try to merge into existing bios if possible, and returns
  866. * zero if all went well.
  867. */
  868. static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
  869. struct bio_list *bio_list,
  870. struct page *page,
  871. int stripe_nr,
  872. unsigned long page_index,
  873. unsigned long bio_max_len)
  874. {
  875. struct bio *last = bio_list->tail;
  876. u64 last_end = 0;
  877. int ret;
  878. struct bio *bio;
  879. struct btrfs_bio_stripe *stripe;
  880. u64 disk_start;
  881. stripe = &rbio->bbio->stripes[stripe_nr];
  882. disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
  883. /* if the device is missing, just fail this stripe */
  884. if (!stripe->dev->bdev)
  885. return fail_rbio_index(rbio, stripe_nr);
  886. /* see if we can add this page onto our existing bio */
  887. if (last) {
  888. last_end = (u64)last->bi_iter.bi_sector << 9;
  889. last_end += last->bi_iter.bi_size;
  890. /*
  891. * we can't merge these if they are from different
  892. * devices or if they are not contiguous
  893. */
  894. if (last_end == disk_start && stripe->dev->bdev &&
  895. test_bit(BIO_UPTODATE, &last->bi_flags) &&
  896. last->bi_bdev == stripe->dev->bdev) {
  897. ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
  898. if (ret == PAGE_CACHE_SIZE)
  899. return 0;
  900. }
  901. }
  902. /* put a new bio on the list */
  903. bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT ?: 1);
  904. if (!bio)
  905. return -ENOMEM;
  906. bio->bi_iter.bi_size = 0;
  907. bio->bi_bdev = stripe->dev->bdev;
  908. bio->bi_iter.bi_sector = disk_start >> 9;
  909. set_bit(BIO_UPTODATE, &bio->bi_flags);
  910. bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
  911. bio_list_add(bio_list, bio);
  912. return 0;
  913. }
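/*
 * Illustrative example of the merge test above: page_index = 5 on a
 * stripe whose physical start is X gives disk_start = X + 5 * 4KiB
 * (with 4KiB pages).  The page is appended to the tail bio only if that
 * bio ends exactly at disk_start and targets the same bdev; otherwise a
 * fresh bio is allocated, sized for roughly bio_max_len >> PAGE_SHIFT
 * pages.
 */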
  914. /*
  915. * while we're doing the read/modify/write cycle, we could
  916. * have errors in reading pages off the disk. This checks
  917. * for errors and if we're not able to read the page it'll
  918. * trigger parity reconstruction. The rmw will be finished
  919. * after we've reconstructed the failed stripes
  920. */
  921. static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
  922. {
  923. if (rbio->faila >= 0 || rbio->failb >= 0) {
  924. BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
  925. __raid56_parity_recover(rbio);
  926. } else {
  927. finish_rmw(rbio);
  928. }
  929. }
  930. /*
  931. * these are just the pages from the rbio array, not from anything
  932. * the FS sent down to us
  933. */
  934. static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
  935. {
  936. int index;
  937. index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
  938. index += page;
  939. return rbio->stripe_pages[index];
  940. }
  941. /*
  942. * helper function to walk our bio list and populate the bio_pages array with
  943. * the result. This seems expensive, but it is faster than constantly
  944. * searching through the bio list as we setup the IO in finish_rmw or stripe
  945. * reconstruction.
  946. *
  947. * This must be called before you trust the answers from page_in_rbio
  948. */
  949. static void index_rbio_pages(struct btrfs_raid_bio *rbio)
  950. {
  951. struct bio *bio;
  952. u64 start;
  953. unsigned long stripe_offset;
  954. unsigned long page_index;
  955. struct page *p;
  956. int i;
  957. spin_lock_irq(&rbio->bio_list_lock);
  958. bio_list_for_each(bio, &rbio->bio_list) {
  959. start = (u64)bio->bi_iter.bi_sector << 9;
  960. stripe_offset = start - rbio->raid_map[0];
  961. page_index = stripe_offset >> PAGE_CACHE_SHIFT;
  962. for (i = 0; i < bio->bi_vcnt; i++) {
  963. p = bio->bi_io_vec[i].bv_page;
  964. rbio->bio_pages[page_index + i] = p;
  965. }
  966. }
  967. spin_unlock_irq(&rbio->bio_list_lock);
  968. }
  969. /*
  970. * this is called from one of two situations. We either
  971. * have a full stripe from the higher layers, or we've read all
  972. * the missing bits off disk.
  973. *
  974. * This will calculate the parity and then send down any
  975. * changed blocks.
  976. */
  977. static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
  978. {
  979. struct btrfs_bio *bbio = rbio->bbio;
  980. void *pointers[bbio->num_stripes];
  981. int stripe_len = rbio->stripe_len;
  982. int nr_data = rbio->nr_data;
  983. int stripe;
  984. int pagenr;
  985. int p_stripe = -1;
  986. int q_stripe = -1;
  987. struct bio_list bio_list;
  988. struct bio *bio;
  989. int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
  990. int ret;
  991. bio_list_init(&bio_list);
  992. if (bbio->num_stripes - rbio->nr_data == 1) {
  993. p_stripe = bbio->num_stripes - 1;
  994. } else if (bbio->num_stripes - rbio->nr_data == 2) {
  995. p_stripe = bbio->num_stripes - 2;
  996. q_stripe = bbio->num_stripes - 1;
  997. } else {
  998. BUG();
  999. }
  1000. /* at this point we either have a full stripe,
  1001. * or we've read the full stripe from the drive.
  1002. * recalculate the parity and write the new results.
  1003. *
  1004. * We're not allowed to add any new bios to the
  1005. * bio list here, anyone else that wants to
  1006. * change this stripe needs to do their own rmw.
  1007. */
  1008. spin_lock_irq(&rbio->bio_list_lock);
  1009. set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
  1010. spin_unlock_irq(&rbio->bio_list_lock);
  1011. atomic_set(&rbio->bbio->error, 0);
  1012. /*
  1013. * now that we've set rmw_locked, run through the
  1014. * bio list one last time and map the page pointers
  1015. *
  1016. * We don't cache full rbios because we're assuming
  1017. * the higher layers are unlikely to use this area of
  1018. * the disk again soon. If they do use it again,
  1019. * hopefully they will send another full bio.
  1020. */
  1021. index_rbio_pages(rbio);
  1022. if (!rbio_is_full(rbio))
  1023. cache_rbio_pages(rbio);
  1024. else
  1025. clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
  1026. for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
  1027. struct page *p;
  1028. /* first collect one page from each data stripe */
  1029. for (stripe = 0; stripe < nr_data; stripe++) {
  1030. p = page_in_rbio(rbio, stripe, pagenr, 0);
  1031. pointers[stripe] = kmap(p);
  1032. }
  1033. /* then add the parity stripe */
  1034. p = rbio_pstripe_page(rbio, pagenr);
  1035. SetPageUptodate(p);
  1036. pointers[stripe++] = kmap(p);
  1037. if (q_stripe != -1) {
  1038. /*
  1039. * raid6, add the qstripe and call the
  1040. * library function to fill in our p/q
  1041. */
  1042. p = rbio_qstripe_page(rbio, pagenr);
  1043. SetPageUptodate(p);
  1044. pointers[stripe++] = kmap(p);
  1045. raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
  1046. pointers);
  1047. } else {
  1048. /* raid5 */
  1049. memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
  1050. run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
  1051. }
  1052. for (stripe = 0; stripe < bbio->num_stripes; stripe++)
  1053. kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
  1054. }
  1055. /*
  1056. * time to start writing. Make bios for everything from the
  1057. * higher layers (the bio_list in our rbio) and our p/q. Ignore
  1058. * everything else.
  1059. */
  1060. for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
  1061. for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
  1062. struct page *page;
  1063. if (stripe < rbio->nr_data) {
  1064. page = page_in_rbio(rbio, stripe, pagenr, 1);
  1065. if (!page)
  1066. continue;
  1067. } else {
  1068. page = rbio_stripe_page(rbio, stripe, pagenr);
  1069. }
  1070. ret = rbio_add_io_page(rbio, &bio_list,
  1071. page, stripe, pagenr, rbio->stripe_len);
  1072. if (ret)
  1073. goto cleanup;
  1074. }
  1075. }
  1076. atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
  1077. BUG_ON(atomic_read(&bbio->stripes_pending) == 0);
  1078. while (1) {
  1079. bio = bio_list_pop(&bio_list);
  1080. if (!bio)
  1081. break;
  1082. bio->bi_private = rbio;
  1083. bio->bi_end_io = raid_write_end_io;
  1084. BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
  1085. submit_bio(WRITE, bio);
  1086. }
  1087. return;
  1088. cleanup:
  1089. rbio_orig_end_io(rbio, -EIO, 0);
  1090. }
  1091. /*
  1092. * helper to find the stripe number for a given bio. Used to figure out which
  1093. * stripe has failed. This expects the bio to correspond to a physical disk,
  1094. * so it looks up based on physical sector numbers.
  1095. */
  1096. static int find_bio_stripe(struct btrfs_raid_bio *rbio,
  1097. struct bio *bio)
  1098. {
  1099. u64 physical = bio->bi_iter.bi_sector;
  1100. u64 stripe_start;
  1101. int i;
  1102. struct btrfs_bio_stripe *stripe;
  1103. physical <<= 9;
  1104. for (i = 0; i < rbio->bbio->num_stripes; i++) {
  1105. stripe = &rbio->bbio->stripes[i];
  1106. stripe_start = stripe->physical;
  1107. if (physical >= stripe_start &&
  1108. physical < stripe_start + rbio->stripe_len) {
  1109. return i;
  1110. }
  1111. }
  1112. return -1;
  1113. }
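/*
 * Note (illustrative): bi_sector counts 512-byte sectors, so the << 9
 * converts it to bytes first.  A bio starting at sector 2048 is byte
 * offset 1MiB, and it is matched against each stripe's
 * [physical, physical + stripe_len) range to find the failed drive.
 */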
  1114. /*
  1115. * helper to find the stripe number for a given
  1116. * bio (before mapping). Used to figure out which stripe has
  1117. * failed. This looks up based on logical block numbers.
  1118. */
  1119. static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
  1120. struct bio *bio)
  1121. {
  1122. u64 logical = bio->bi_iter.bi_sector;
  1123. u64 stripe_start;
  1124. int i;
  1125. logical <<= 9;
  1126. for (i = 0; i < rbio->nr_data; i++) {
  1127. stripe_start = rbio->raid_map[i];
  1128. if (logical >= stripe_start &&
  1129. logical < stripe_start + rbio->stripe_len) {
  1130. return i;
  1131. }
  1132. }
  1133. return -1;
  1134. }
  1135. /*
  1136. * returns -EIO if we had too many failures
  1137. */
  1138. static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
  1139. {
  1140. unsigned long flags;
  1141. int ret = 0;
  1142. spin_lock_irqsave(&rbio->bio_list_lock, flags);
  1143. /* we already know this stripe is bad, move on */
  1144. if (rbio->faila == failed || rbio->failb == failed)
  1145. goto out;
  1146. if (rbio->faila == -1) {
  1147. /* first failure on this rbio */
  1148. rbio->faila = failed;
  1149. atomic_inc(&rbio->bbio->error);
  1150. } else if (rbio->failb == -1) {
  1151. /* second failure on this rbio */
  1152. rbio->failb = failed;
  1153. atomic_inc(&rbio->bbio->error);
  1154. } else {
  1155. ret = -EIO;
  1156. }
  1157. out:
  1158. spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
  1159. return ret;
  1160. }
  1161. /*
  1162. * helper to fail a stripe based on a physical disk
  1163. * bio.
  1164. */
  1165. static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
  1166. struct bio *bio)
  1167. {
  1168. int failed = find_bio_stripe(rbio, bio);
  1169. if (failed < 0)
  1170. return -EIO;
  1171. return fail_rbio_index(rbio, failed);
  1172. }
  1173. /*
  1174. * this sets each page in the bio uptodate. It should only be used on private
  1175. * rbio pages, nothing that comes in from the higher layers
  1176. */
  1177. static void set_bio_pages_uptodate(struct bio *bio)
  1178. {
  1179. int i;
  1180. struct page *p;
  1181. for (i = 0; i < bio->bi_vcnt; i++) {
  1182. p = bio->bi_io_vec[i].bv_page;
  1183. SetPageUptodate(p);
  1184. }
  1185. }
  1186. /*
  1187. * end io for the read phase of the rmw cycle. All the bios here are physical
  1188. * stripe bios we've read from the disk so we can recalculate the parity of the
  1189. * stripe.
  1190. *
  1191. * This will usually kick off finish_rmw once all the bios are read in, but it
  1192. * may trigger parity reconstruction if we had any errors along the way
  1193. */
  1194. static void raid_rmw_end_io(struct bio *bio, int err)
  1195. {
  1196. struct btrfs_raid_bio *rbio = bio->bi_private;
  1197. if (err)
  1198. fail_bio_stripe(rbio, bio);
  1199. else
  1200. set_bio_pages_uptodate(bio);
  1201. bio_put(bio);
  1202. if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
  1203. return;
  1204. err = 0;
  1205. if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
  1206. goto cleanup;
  1207. /*
  1208. * this will normally call finish_rmw to start our write
  1209. * but if there are any failed stripes we'll reconstruct
  1210. * from parity first
  1211. */
  1212. validate_rbio_for_rmw(rbio);
  1213. return;
  1214. cleanup:
  1215. rbio_orig_end_io(rbio, -EIO, 0);
  1216. }
  1217. static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
  1218. {
  1219. btrfs_init_work(&rbio->work, rmw_work, NULL, NULL);
  1220. btrfs_queue_work(rbio->fs_info->rmw_workers,
  1221. &rbio->work);
  1222. }
  1223. static void async_read_rebuild(struct btrfs_raid_bio *rbio)
  1224. {
  1225. btrfs_init_work(&rbio->work, read_rebuild_work, NULL, NULL);
  1226. btrfs_queue_work(rbio->fs_info->rmw_workers,
  1227. &rbio->work);
  1228. }
  1229. /*
  1230. * the stripe must be locked by the caller. It will
  1231. * unlock after all the writes are done
  1232. */
  1233. static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
  1234. {
  1235. int bios_to_read = 0;
  1236. struct btrfs_bio *bbio = rbio->bbio;
  1237. struct bio_list bio_list;
  1238. int ret;
  1239. int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
  1240. int pagenr;
  1241. int stripe;
  1242. struct bio *bio;
  1243. bio_list_init(&bio_list);
  1244. ret = alloc_rbio_pages(rbio);
  1245. if (ret)
  1246. goto cleanup;
  1247. index_rbio_pages(rbio);
  1248. atomic_set(&rbio->bbio->error, 0);
  1249. /*
  1250. * build a list of bios to read all the missing parts of this
  1251. * stripe
  1252. */
  1253. for (stripe = 0; stripe < rbio->nr_data; stripe++) {
  1254. for (pagenr = 0; pagenr < nr_pages; pagenr++) {
  1255. struct page *page;
  1256. /*
  1257. * we want to find all the pages missing from
  1258. * the rbio and read them from the disk. If
  1259. * page_in_rbio finds a page in the bio list
  1260. * we don't need to read it off the stripe.
  1261. */
  1262. page = page_in_rbio(rbio, stripe, pagenr, 1);
  1263. if (page)
  1264. continue;
  1265. page = rbio_stripe_page(rbio, stripe, pagenr);
  1266. /*
  1267. * the bio cache may have handed us an uptodate
  1268. * page. If so, be happy and use it
  1269. */
  1270. if (PageUptodate(page))
  1271. continue;
  1272. ret = rbio_add_io_page(rbio, &bio_list, page,
  1273. stripe, pagenr, rbio->stripe_len);
  1274. if (ret)
  1275. goto cleanup;
  1276. }
  1277. }
  1278. bios_to_read = bio_list_size(&bio_list);
  1279. if (!bios_to_read) {
  1280. /*
  1281. * this can happen if others have merged with
  1282. * us, it means there is nothing left to read.
  1283. * But if there are missing devices it may not be
  1284. * safe to do the full stripe write yet.
  1285. */
  1286. goto finish;
  1287. }
  1288. /*
  1289. * the bbio may be freed once we submit the last bio. Make sure
  1290. * not to touch it after that
  1291. */
  1292. atomic_set(&bbio->stripes_pending, bios_to_read);
  1293. while (1) {
  1294. bio = bio_list_pop(&bio_list);
  1295. if (!bio)
  1296. break;
  1297. bio->bi_private = rbio;
  1298. bio->bi_end_io = raid_rmw_end_io;
  1299. btrfs_bio_wq_end_io(rbio->fs_info, bio,
  1300. BTRFS_WQ_ENDIO_RAID56);
  1301. BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
  1302. submit_bio(READ, bio);
  1303. }
  1304. /* the actual write will happen once the reads are done */
  1305. return 0;
  1306. cleanup:
  1307. rbio_orig_end_io(rbio, -EIO, 0);
  1308. return -EIO;
  1309. finish:
  1310. validate_rbio_for_rmw(rbio);
  1311. return 0;
  1312. }
  1313. /*
  1314. * if the upper layers pass in a full stripe, we thank them by only allocating
  1315. * enough pages to hold the parity, and sending it all down quickly.
  1316. */
  1317. static int full_stripe_write(struct btrfs_raid_bio *rbio)
  1318. {
  1319. int ret;
  1320. ret = alloc_rbio_parity_pages(rbio);
  1321. if (ret) {
  1322. __free_raid_bio(rbio);
  1323. return ret;
  1324. }
  1325. ret = lock_stripe_add(rbio);
  1326. if (ret == 0)
  1327. finish_rmw(rbio);
  1328. return 0;
  1329. }
  1330. /*
  1331. * partial stripe writes get handed over to async helpers.
  1332. * We're really hoping to merge a few more writes into this
  1333. * rbio before calculating new parity
  1334. */
  1335. static int partial_stripe_write(struct btrfs_raid_bio *rbio)
  1336. {
  1337. int ret;
  1338. ret = lock_stripe_add(rbio);
  1339. if (ret == 0)
  1340. async_rmw_stripe(rbio);
  1341. return 0;
  1342. }
  1343. /*
  1344. * sometimes while we were reading from the drive to
  1345. * recalculate parity, enough new bios come in to create
  1346. * a full stripe. So we do a check here to see if we can
  1347. * go directly to finish_rmw
  1348. */
  1349. static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
  1350. {
  1351. /* head off into rmw land if we don't have a full stripe */
  1352. if (!rbio_is_full(rbio))
  1353. return partial_stripe_write(rbio);
  1354. return full_stripe_write(rbio);
  1355. }
  1356. /*
  1357. * We use plugging callbacks to collect full stripes.
  1358. * Any time we get a partial stripe write while plugged
  1359. * we collect it into a list. When the unplug comes down,
  1360. * we sort the list by logical block number and merge
  1361. * everything we can into the same rbios
  1362. */
  1363. struct btrfs_plug_cb {
  1364. struct blk_plug_cb cb;
  1365. struct btrfs_fs_info *info;
  1366. struct list_head rbio_list;
  1367. struct btrfs_work work;
  1368. };
  1369. /*
  1370. * rbios on the plug list are sorted for easier merging.
  1371. */
  1372. static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
  1373. {
  1374. struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
  1375. plug_list);
  1376. struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
  1377. plug_list);
  1378. u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
  1379. u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
  1380. if (a_sector < b_sector)
  1381. return -1;
  1382. if (a_sector > b_sector)
  1383. return 1;
  1384. return 0;
  1385. }
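/*
 * Design note (not from the original source): sorting the plugged rbios
 * by the logical sector of their first bio groups writes to the same
 * full stripe next to each other, which is why run_plug() only has to
 * compare each entry against the previous one ("last") to find every
 * possible merge.
 */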
  1386. static void run_plug(struct btrfs_plug_cb *plug)
  1387. {
  1388. struct btrfs_raid_bio *cur;
  1389. struct btrfs_raid_bio *last = NULL;
  1390. /*
  1391. * sort our plug list then try to merge
  1392. * everything we can in hopes of creating full
  1393. * stripes.
  1394. */
  1395. list_sort(NULL, &plug->rbio_list, plug_cmp);
  1396. while (!list_empty(&plug->rbio_list)) {
  1397. cur = list_entry(plug->rbio_list.next,
  1398. struct btrfs_raid_bio, plug_list);
  1399. list_del_init(&cur->plug_list);
  1400. if (rbio_is_full(cur)) {
  1401. /* we have a full stripe, send it down */
  1402. full_stripe_write(cur);
  1403. continue;
  1404. }
  1405. if (last) {
  1406. if (rbio_can_merge(last, cur)) {
  1407. merge_rbio(last, cur);
  1408. __free_raid_bio(cur);
  1409. continue;
  1410. }
  1411. __raid56_parity_write(last);
  1412. }
  1413. last = cur;
  1414. }
  1415. if (last) {
  1416. __raid56_parity_write(last);
  1417. }
  1418. kfree(plug);
  1419. }
  1420. /*
  1421. * if the unplug comes from schedule, we have to push the
  1422. * work off to a helper thread
  1423. */
  1424. static void unplug_work(struct btrfs_work *work)
  1425. {
  1426. struct btrfs_plug_cb *plug;
  1427. plug = container_of(work, struct btrfs_plug_cb, work);
  1428. run_plug(plug);
  1429. }
  1430. static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
  1431. {
  1432. struct btrfs_plug_cb *plug;
  1433. plug = container_of(cb, struct btrfs_plug_cb, cb);
  1434. if (from_schedule) {
  1435. btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
  1436. btrfs_queue_work(plug->info->rmw_workers,
  1437. &plug->work);
  1438. return;
  1439. }
  1440. run_plug(plug);
  1441. }

/*
 * our main entry point for writes from the rest of the FS.
 */
int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
			struct btrfs_bio *bbio, u64 *raid_map,
			u64 stripe_len)
{
	struct btrfs_raid_bio *rbio;
	struct btrfs_plug_cb *plug = NULL;
	struct blk_plug_cb *cb;

	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
	if (IS_ERR(rbio))
		return PTR_ERR(rbio);
	bio_list_add(&rbio->bio_list, bio);
	rbio->bio_list_bytes = bio->bi_iter.bi_size;

	/*
	 * don't plug on full rbios, just get them out the door
	 * as quickly as we can
	 */
	if (rbio_is_full(rbio))
		return full_stripe_write(rbio);

	cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
			       sizeof(*plug));
	if (cb) {
		plug = container_of(cb, struct btrfs_plug_cb, cb);
		if (!plug->info) {
			plug->info = root->fs_info;
			INIT_LIST_HEAD(&plug->rbio_list);
		}
		list_add_tail(&rbio->plug_list, &plug->rbio_list);
	} else {
		return __raid56_parity_write(rbio);
	}
	return 0;
}
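
/*
 * Illustrative sketch of the plug lifecycle that drives the code above.
 * This helper is not part of the btrfs write path; it and its submit_one()
 * callback are hypothetical, and the real callers plug around their own
 * submission loops.  The point is only that, while the task is plugged,
 * raid56_parity_write() parks partial stripes on plug->rbio_list via
 * blk_check_plugged(), and blk_finish_plug() is what eventually runs
 * btrfs_raid_unplug().
 */
static void __maybe_unused raid56_plugged_submit_sketch(void *ctx,
				void (*submit_one)(void *ctx, int idx),
				int nr)
{
	struct blk_plug plug;
	int i;

	/* while plugged, partial stripe writes pile up on the plug list */
	blk_start_plug(&plug);
	for (i = 0; i < nr; i++)
		submit_one(ctx, i);
	/* unplugging sorts and merges the parked rbios via btrfs_raid_unplug() */
	blk_finish_plug(&plug);
}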

/*
 * all parity reconstruction happens here.  We've read in everything
 * we can find from the drives and this does the heavy lifting of
 * sorting the good from the bad.
 */
static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
{
	int pagenr, stripe;
	void **pointers;
	int faila = -1, failb = -1;
	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	struct page *page;
	int err;
	int i;

	pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
			   GFP_NOFS);
	if (!pointers) {
		err = -ENOMEM;
		goto cleanup_io;
	}

	faila = rbio->faila;
	failb = rbio->failb;

	if (rbio->read_rebuild) {
		spin_lock_irq(&rbio->bio_list_lock);
		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
		spin_unlock_irq(&rbio->bio_list_lock);
	}

	index_rbio_pages(rbio);

	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
		/* setup our array of pointers with pages
		 * from each stripe
		 */
		for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
			/*
			 * if we're rebuilding a read, we have to use
			 * pages from the bio list
			 */
			if (rbio->read_rebuild &&
			    (stripe == faila || stripe == failb)) {
				page = page_in_rbio(rbio, stripe, pagenr, 0);
			} else {
				page = rbio_stripe_page(rbio, stripe, pagenr);
			}
			pointers[stripe] = kmap(page);
		}

		/* all raid6 handling here */
		if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
		    RAID6_Q_STRIPE) {
			/*
			 * single failure, rebuild from parity raid5
			 * style
			 */
			if (failb < 0) {
				if (faila == rbio->nr_data) {
					/*
					 * Just the P stripe has failed, without
					 * a bad data or Q stripe.
					 * TODO, we should redo the xor here.
					 */
					err = -EIO;
					goto cleanup;
				}
				/*
				 * a single failure in raid6 is rebuilt
				 * in the pstripe code below
				 */
				goto pstripe;
			}

			/* make sure our ps and qs are in order */
			if (faila > failb) {
				int tmp = failb;
				failb = faila;
				faila = tmp;
			}

			/* if the q stripe is failed, do a pstripe reconstruction
			 * from the xors.
			 * If both the q stripe and the P stripe are failed, we're
			 * here due to a crc mismatch and we can't give them the
			 * data they want
			 */
			if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
				if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
					err = -EIO;
					goto cleanup;
				}
				/*
				 * otherwise we have one bad data stripe and
				 * a good P stripe.  raid5!
				 */
				goto pstripe;
			}

			if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
				raid6_datap_recov(rbio->bbio->num_stripes,
						  PAGE_SIZE, faila, pointers);
			} else {
				raid6_2data_recov(rbio->bbio->num_stripes,
						  PAGE_SIZE, faila, failb,
						  pointers);
			}
		} else {
			void *p;

			/* rebuild from P stripe here (raid5 or raid6) */
			BUG_ON(failb != -1);
pstripe:
			/* Copy parity block into failed block to start with */
			memcpy(pointers[faila],
			       pointers[rbio->nr_data],
			       PAGE_CACHE_SIZE);

			/* rearrange the pointer array */
			p = pointers[faila];
			for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
				pointers[stripe] = pointers[stripe + 1];
			pointers[rbio->nr_data - 1] = p;

			/* xor in the rest */
			run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
		}

		/* if we're doing this rebuild as part of an rmw, go through
		 * and set all of our private rbio pages in the
		 * failed stripes as uptodate.  This way finish_rmw will
		 * know they can be trusted.  If this was a read reconstruction,
		 * other endio functions will fiddle the uptodate bits
		 */
		if (!rbio->read_rebuild) {
			for (i = 0; i < nr_pages; i++) {
				if (faila != -1) {
					page = rbio_stripe_page(rbio, faila, i);
					SetPageUptodate(page);
				}
				if (failb != -1) {
					page = rbio_stripe_page(rbio, failb, i);
					SetPageUptodate(page);
				}
			}
		}
		for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
			/*
			 * if we're rebuilding a read, we have to use
			 * pages from the bio list
			 */
			if (rbio->read_rebuild &&
			    (stripe == faila || stripe == failb)) {
				page = page_in_rbio(rbio, stripe, pagenr, 0);
			} else {
				page = rbio_stripe_page(rbio, stripe, pagenr);
			}
			kunmap(page);
		}
	}

	err = 0;
cleanup:
	kfree(pointers);
cleanup_io:
	if (rbio->read_rebuild) {
		if (err == 0)
			cache_rbio_pages(rbio);
		else
			clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

		rbio_orig_end_io(rbio, err, err == 0);
	} else if (err == 0) {
		rbio->faila = -1;
		rbio->failb = -1;
		finish_rmw(rbio);
	} else {
		rbio_orig_end_io(rbio, err, 0);
	}
}
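
/*
 * The pstripe path above rebuilds a single missing block with plain xor:
 * since P = D0 ^ D1 ^ ... ^ Dn-1, the missing Dk equals the xor of P and
 * the surviving data blocks, and the memcpy + pointer rotation + run_xor()
 * sequence computes exactly that, one page at a time.  The helper below is
 * an illustrative sketch of the same arithmetic and is not used by the
 * driver; "survivors" is assumed to hold the surviving data pages plus P.
 */
static void __maybe_unused xor_rebuild_sketch(void *dest, void **survivors,
					      int nr_survivors, size_t len)
{
	u8 *d = dest;
	size_t off;
	int i;

	for (off = 0; off < len; off++) {
		u8 v = 0;

		/* xor the byte at this offset across every surviving block */
		for (i = 0; i < nr_survivors; i++)
			v ^= ((u8 *)survivors[i])[off];
		d[off] = v;
	}
}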

/*
 * This is called only for stripes we've read from disk to
 * reconstruct the parity.
 */
static void raid_recover_end_io(struct bio *bio, int err)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	/*
	 * we only read stripe pages off the disk, set them
	 * up to date if there were no errors
	 */
	if (err)
		fail_bio_stripe(rbio, bio);
	else
		set_bio_pages_uptodate(bio);
	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
		return;

	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
		rbio_orig_end_io(rbio, -EIO, 0);
	else
		__raid_recover_end_io(rbio);
}

/*
 * reads everything we need off the disk to reconstruct
 * the parity. endio handlers trigger final reconstruction
 * when the IO is done.
 *
 * This is used both for reads from the higher layers and for
 * parity construction required to finish a rmw cycle.
 */
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
	struct btrfs_bio *bbio = rbio->bbio;
	struct bio_list bio_list;
	int ret;
	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	int pagenr;
	int stripe;
	struct bio *bio;

	bio_list_init(&bio_list);

	ret = alloc_rbio_pages(rbio);
	if (ret)
		goto cleanup;

	atomic_set(&rbio->bbio->error, 0);

	/*
	 * read everything that hasn't failed.  Thanks to the
	 * stripe cache, it is possible that some or all of these
	 * pages are going to be uptodate.
	 */
	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
		if (rbio->faila == stripe || rbio->failb == stripe) {
			atomic_inc(&rbio->bbio->error);
			continue;
		}

		for (pagenr = 0; pagenr < nr_pages; pagenr++) {
			struct page *p;

			/*
			 * the rmw code may have already read this
			 * page in
			 */
			p = rbio_stripe_page(rbio, stripe, pagenr);
			if (PageUptodate(p))
				continue;

			ret = rbio_add_io_page(rbio, &bio_list,
				       rbio_stripe_page(rbio, stripe, pagenr),
				       stripe, pagenr, rbio->stripe_len);
			if (ret < 0)
				goto cleanup;
		}
	}

	bios_to_read = bio_list_size(&bio_list);
	if (!bios_to_read) {
		/*
		 * we might have no bios to read just because the pages
		 * were up to date, or we might have no bios to read because
		 * the devices were gone.
		 */
		if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
			__raid_recover_end_io(rbio);
			goto out;
		} else {
			goto cleanup;
		}
	}

	/*
	 * the bbio may be freed once we submit the last bio.  Make sure
	 * not to touch it after that
	 */
	atomic_set(&bbio->stripes_pending, bios_to_read);
	while (1) {
		bio = bio_list_pop(&bio_list);
		if (!bio)
			break;

		bio->bi_private = rbio;
		bio->bi_end_io = raid_recover_end_io;

		btrfs_bio_wq_end_io(rbio->fs_info, bio,
				    BTRFS_WQ_ENDIO_RAID56);

		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
		submit_bio(READ, bio);
	}
out:
	return 0;

cleanup:
	if (rbio->read_rebuild)
		rbio_orig_end_io(rbio, -EIO, 0);
	return -EIO;
}
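
/*
 * The max_errors checks above and in raid_recover_end_io() come from the
 * chunk mapping code: a RAID5 stripe can be reconstructed with at most one
 * missing device, a RAID6 stripe with at most two.  Anything beyond that
 * and the rbio is failed with -EIO instead of being handed to
 * __raid_recover_end_io().
 */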

/*
 * the main entry point for reads from the higher layers.  This
 * is really only called when the normal read path had a failure,
 * so we assume the bio they send down corresponds to a failed part
 * of the drive.
 */
int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
			  struct btrfs_bio *bbio, u64 *raid_map,
			  u64 stripe_len, int mirror_num)
{
	struct btrfs_raid_bio *rbio;
	int ret;

	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
	if (IS_ERR(rbio))
		return PTR_ERR(rbio);

	rbio->read_rebuild = 1;
	bio_list_add(&rbio->bio_list, bio);
	rbio->bio_list_bytes = bio->bi_iter.bi_size;

	rbio->faila = find_logical_bio_stripe(rbio, bio);
	if (rbio->faila == -1) {
		BUG();
		kfree(raid_map);
		kfree(bbio);
		kfree(rbio);
		return -EIO;
	}

	/*
	 * reconstruct from the q stripe if they are
	 * asking for mirror 3
	 */
	if (mirror_num == 3)
		rbio->failb = bbio->num_stripes - 2;

	ret = lock_stripe_add(rbio);

	/*
	 * __raid56_parity_recover will end the bio with
	 * any errors it hits.  We don't want to return
	 * its error value up the stack because our caller
	 * will end up calling bio_endio with any nonzero
	 * return
	 */
	if (ret == 0)
		__raid56_parity_recover(rbio);
	/*
	 * our rbio has been added to the list of
	 * rbios that will be handled after the
	 * current lock owner is done
	 */
	return 0;
}
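
/*
 * A note on the mirror_num convention used above: mirror 1 is the normal
 * read path, any higher mirror number means the caller wants a rebuild of
 * the failed block, and mirror 3 additionally marks the P stripe (index
 * num_stripes - 2 in a raid6 bbio) as failed so that reconstruction is
 * forced to go through the Q stripe.
 */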

static void rmw_work(struct btrfs_work *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	raid56_rmw_stripe(rbio);
}

static void read_rebuild_work(struct btrfs_work *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	__raid56_parity_recover(rbio);
}