raid5-ppl.c

/*
 * Partial Parity Log for closing the RAID5 write hole
 * Copyright (c) 2017, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 */

#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/crc32c.h>
#include <linux/flex_array.h>
#include <linux/async_tx.h>
#include <linux/raid/md_p.h>
#include "md.h"
#include "raid5.h"

/*
 * PPL consists of a 4KB header (struct ppl_header) and at least 128KB for
 * partial parity data. The header contains an array of entries
 * (struct ppl_header_entry) which describe the logged write requests.
 * Partial parity for the entries comes after the header, written in the same
 * sequence as the entries:
 *
 * Header
 *   entry0
 *   ...
 *   entryN
 * PP data
 *   PP for entry0
 *   ...
 *   PP for entryN
 *
 * An entry describes one or more consecutive stripe_heads, up to a full
 * stripe. The modified raid data chunks form an m-by-n matrix, where m is the
 * number of stripe_heads in the entry and n is the number of modified data
 * disks. Every stripe_head in the entry must write to the same data disks.
 * An example of a valid case described by a single entry (writes to the first
 * stripe of a 4 disk array, 16k chunk size):
 *
 * sh->sector   dd0   dd1   dd2    ppl
 *            +-----+-----+-----+
 *          0 | --- | --- | --- | +----+
 *          8 | -W- | -W- | --- | | pp |   data_sector = 8
 *         16 | -W- | -W- | --- | | pp |   data_size = 3 * 2 * 4k
 *         24 | -W- | -W- | --- | | pp |   pp_size = 3 * 4k
 *            +-----+-----+-----+ +----+
 *
 * data_sector is the first raid sector of the modified data, data_size is the
 * total size of modified data and pp_size is the size of partial parity for
 * this entry. Entries for full stripe writes contain no partial parity
 * (pp_size = 0), they only mark the stripes for which parity should be
 * recalculated after an unclean shutdown. Every entry holds a checksum of its
 * partial parity, the header also has a checksum of the header itself.
 *
 * A write request is always logged to the PPL instance stored on the parity
 * disk of the corresponding stripe. For each member disk there is one ppl_log
 * used to handle logging for this disk, independently from others. They are
 * grouped in child_logs array in struct ppl_conf, which is assigned to
 * r5conf->log_private.
 *
 * ppl_io_unit represents a full PPL write, header_page contains the ppl_header.
 * PPL entries for logged stripes are added in ppl_log_stripe(). A stripe_head
 * can be appended to the last entry if it meets the conditions for a valid
 * entry described above, otherwise a new entry is added. Checksums of entries
 * are calculated incrementally as stripes containing partial parity are being
 * added. ppl_submit_iounit() calculates the checksum of the header and submits
 * a bio containing the header page and partial parity pages (sh->ppl_page) for
 * all stripes of the io_unit. When the PPL write completes, the stripes
 * associated with the io_unit are released and raid5d starts writing their data
 * and parity. When all stripes are written, the io_unit is freed and the next
 * can be submitted.
 *
 * An io_unit is used to gather stripes until it is submitted or becomes full
 * (if the maximum number of entries or size of PPL is reached). Another io_unit
 * can't be submitted until the previous has completed (PPL and stripe
 * data+parity is written). The log->io_list tracks all io_units of a log
 * (for a single member disk). New io_units are added to the end of the list
 * and the first io_unit is submitted, if it is not submitted already.
 * The current io_unit accepting new stripes is always at the end of the list.
 */

struct ppl_conf {
	struct mddev *mddev;

	/* array of child logs, one for each raid disk */
	struct ppl_log *child_logs;
	int count;

	int block_size;		/* the logical block size used for data_sector
				 * in ppl_header_entry */
	u32 signature;		/* raid array identifier */
	atomic64_t seq;		/* current log write sequence number */

	struct kmem_cache *io_kc;
	mempool_t *io_pool;
	struct bio_set *bs;

	/* used only for recovery */
	int recovered_entries;
	int mismatch_count;

	/* stripes to retry if failed to allocate io_unit */
	struct list_head no_mem_stripes;
	spinlock_t no_mem_stripes_lock;
};

struct ppl_log {
	struct ppl_conf *ppl_conf;	/* shared between all log instances */

	struct md_rdev *rdev;		/* array member disk associated with
					 * this log instance */
	struct mutex io_mutex;
	struct ppl_io_unit *current_io;	/* current io_unit accepting new data
					 * always at the end of io_list */
	spinlock_t io_list_lock;
	struct list_head io_list;	/* all io_units of this log */
};

#define PPL_IO_INLINE_BVECS 32

struct ppl_io_unit {
	struct ppl_log *log;

	struct page *header_page;	/* for ppl_header */

	unsigned int entries_count;	/* number of entries in ppl_header */
	unsigned int pp_size;		/* current total size of partial parity */

	u64 seq;			/* sequence number of this log write */
	struct list_head log_sibling;	/* log->io_list */

	struct list_head stripe_list;	/* stripes added to the io_unit */
	atomic_t pending_stripes;	/* how many stripes not written to raid */

	bool submitted;			/* true if write to log started */

	/* inline bio and its biovec for submitting the iounit */
	struct bio bio;
	struct bio_vec biovec[PPL_IO_INLINE_BVECS];
};

struct dma_async_tx_descriptor *
ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
		       struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	struct page **srcs = flex_array_get(percpu->scribble, 0);
	int count = 0, pd_idx = sh->pd_idx, i;
	struct async_submit_ctl submit;

	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);

	/*
	 * Partial parity is the XOR of stripe data chunks that are not changed
	 * during the write request. Depending on available data
	 * (read-modify-write vs. reconstruct-write case) we calculate it
	 * differently.
	 */
	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
		/*
		 * rmw: xor old data and parity from updated disks
		 * This is calculated earlier by ops_run_prexor5() so just copy
		 * the parity dev page.
		 */
		srcs[count++] = sh->dev[pd_idx].page;
	} else if (sh->reconstruct_state == reconstruct_state_drain_run) {
		/* rcw: xor data from all not updated disks */
		for (i = disks; i--;) {
			struct r5dev *dev = &sh->dev[i];
			if (test_bit(R5_UPTODATE, &dev->flags))
				srcs[count++] = dev->page;
		}
	} else {
		return tx;
	}

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, tx,
			  NULL, sh, flex_array_get(percpu->scribble, 0)
			  + sizeof(struct page *) * (sh->disks + 2));

	if (count == 1)
		tx = async_memcpy(sh->ppl_page, srcs[0], 0, 0, PAGE_SIZE,
				  &submit);
	else
		tx = async_xor(sh->ppl_page, srcs, 0, count, PAGE_SIZE,
			       &submit);

	return tx;
}

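/*
 * mempool element constructor/destructor for ppl_io_unit: each element is
 * allocated from the io_unit slab cache together with a page that will hold
 * its ppl_header.
 */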
static void *ppl_io_pool_alloc(gfp_t gfp_mask, void *pool_data)
{
	struct kmem_cache *kc = pool_data;
	struct ppl_io_unit *io;

	io = kmem_cache_alloc(kc, gfp_mask);
	if (!io)
		return NULL;

	io->header_page = alloc_page(gfp_mask);
	if (!io->header_page) {
		kmem_cache_free(kc, io);
		return NULL;
	}

	return io;
}

static void ppl_io_pool_free(void *element, void *pool_data)
{
	struct kmem_cache *kc = pool_data;
	struct ppl_io_unit *io = element;

	__free_page(io->header_page);
	kmem_cache_free(kc, io);
}

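/*
 * Take an io_unit from the pool and (re)initialize it: clear the structure
 * while keeping the preallocated header page, reset the header (reserved
 * bytes set to 0xff, array signature filled in) and assign the next log
 * sequence number. Returns NULL if the GFP_NOWAIT pool allocation fails.
 */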
static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log,
					  struct stripe_head *sh)
{
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct ppl_io_unit *io;
	struct ppl_header *pplhdr;
	struct page *header_page;

	io = mempool_alloc(ppl_conf->io_pool, GFP_NOWAIT);
	if (!io)
		return NULL;

	header_page = io->header_page;
	memset(io, 0, sizeof(*io));
	io->header_page = header_page;

	io->log = log;
	INIT_LIST_HEAD(&io->log_sibling);
	INIT_LIST_HEAD(&io->stripe_list);
	atomic_set(&io->pending_stripes, 0);
	bio_init(&io->bio, io->biovec, PPL_IO_INLINE_BVECS);

	pplhdr = page_address(io->header_page);
	clear_page(pplhdr);
	memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
	pplhdr->signature = cpu_to_le32(ppl_conf->signature);

	io->seq = atomic64_add_return(1, &ppl_conf->seq);
	pplhdr->generation = cpu_to_le64(io->seq);

	return io;
}

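/*
 * Add a stripe_head to the current io_unit of the log: either append it to
 * the last header entry (if it directly follows the previously logged stripe
 * and writes to the same data disks) or start a new entry. The entry checksum
 * is updated incrementally with the stripe's partial parity page.
 */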
static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh)
{
	struct ppl_io_unit *io = log->current_io;
	struct ppl_header_entry *e = NULL;
	struct ppl_header *pplhdr;
	int i;
	sector_t data_sector = 0;
	int data_disks = 0;
	unsigned int entry_space = (log->rdev->ppl.size << 9) - PPL_HEADER_SIZE;
	struct r5conf *conf = sh->raid_conf;

	pr_debug("%s: stripe: %llu\n", __func__, (unsigned long long)sh->sector);

	/* check if current io_unit is full */
	if (io && (io->pp_size == entry_space ||
		   io->entries_count == PPL_HDR_MAX_ENTRIES)) {
		pr_debug("%s: add io_unit blocked by seq: %llu\n",
			 __func__, io->seq);
		io = NULL;
	}

	/* add a new unit if there is none or the current is full */
	if (!io) {
		io = ppl_new_iounit(log, sh);
		if (!io)
			return -ENOMEM;
		spin_lock_irq(&log->io_list_lock);
		list_add_tail(&io->log_sibling, &log->io_list);
		spin_unlock_irq(&log->io_list_lock);

		log->current_io = io;
	}

	for (i = 0; i < sh->disks; i++) {
		struct r5dev *dev = &sh->dev[i];

		if (i != sh->pd_idx && test_bit(R5_Wantwrite, &dev->flags)) {
			if (!data_disks || dev->sector < data_sector)
				data_sector = dev->sector;
			data_disks++;
		}
	}
	BUG_ON(!data_disks);

	pr_debug("%s: seq: %llu data_sector: %llu data_disks: %d\n", __func__,
		 io->seq, (unsigned long long)data_sector, data_disks);

	pplhdr = page_address(io->header_page);

	if (io->entries_count > 0) {
		struct ppl_header_entry *last =
				&pplhdr->entries[io->entries_count - 1];
		struct stripe_head *sh_last = list_last_entry(
				&io->stripe_list, struct stripe_head, log_list);
		u64 data_sector_last = le64_to_cpu(last->data_sector);
		u32 data_size_last = le32_to_cpu(last->data_size);

		/*
		 * Check if we can append the stripe to the last entry. It must
		 * be just after the last logged stripe and write to the same
		 * disks. Use bit shift and logarithm to avoid 64-bit division.
		 */
		if ((sh->sector == sh_last->sector + STRIPE_SECTORS) &&
		    (data_sector >> ilog2(conf->chunk_sectors) ==
		     data_sector_last >> ilog2(conf->chunk_sectors)) &&
		    ((data_sector - data_sector_last) * data_disks ==
		     data_size_last >> 9))
			e = last;
	}

	if (!e) {
		e = &pplhdr->entries[io->entries_count++];
		e->data_sector = cpu_to_le64(data_sector);
		e->parity_disk = cpu_to_le32(sh->pd_idx);
		e->checksum = cpu_to_le32(~0);
	}

	le32_add_cpu(&e->data_size, data_disks << PAGE_SHIFT);

	/* don't write any PP if full stripe write */
	if (!test_bit(STRIPE_FULL_WRITE, &sh->state)) {
		le32_add_cpu(&e->pp_size, PAGE_SIZE);
		io->pp_size += PAGE_SIZE;
		e->checksum = cpu_to_le32(crc32c_le(le32_to_cpu(e->checksum),
						    page_address(sh->ppl_page),
						    PAGE_SIZE));
	}

	list_add_tail(&sh->log_list, &io->stripe_list);
	atomic_inc(&io->pending_stripes);
	sh->ppl_io = io;

	return 0;
}

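/*
 * Log a stripe to the PPL of its parity disk. Returns -EAGAIN if the stripe
 * cannot be logged: it is syncing, has no partial parity page, its parity
 * device is not marked for write or is out of sync, or the log device is
 * missing or faulty. If an io_unit cannot be allocated, the stripe is queued
 * on no_mem_stripes and retried when a previous io_unit completes.
 */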
int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh)
{
	struct ppl_conf *ppl_conf = conf->log_private;
	struct ppl_io_unit *io = sh->ppl_io;
	struct ppl_log *log;

	if (io || test_bit(STRIPE_SYNCING, &sh->state) || !sh->ppl_page ||
	    !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
	    !test_bit(R5_Insync, &sh->dev[sh->pd_idx].flags)) {
		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
		return -EAGAIN;
	}

	log = &ppl_conf->child_logs[sh->pd_idx];

	mutex_lock(&log->io_mutex);

	if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) {
		mutex_unlock(&log->io_mutex);
		return -EAGAIN;
	}

	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
	clear_bit(STRIPE_DELAYED, &sh->state);
	atomic_inc(&sh->count);

	if (ppl_log_stripe(log, sh)) {
		spin_lock_irq(&ppl_conf->no_mem_stripes_lock);
		list_add_tail(&sh->log_list, &ppl_conf->no_mem_stripes);
		spin_unlock_irq(&ppl_conf->no_mem_stripes_lock);
	}

	mutex_unlock(&log->io_mutex);

	return 0;
}

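/*
 * Completion callback for the PPL write bio: on error mark the log device as
 * failed, then release all stripes of the io_unit so that raid5d can write
 * their data and parity to the array.
 */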
static void ppl_log_endio(struct bio *bio)
{
	struct ppl_io_unit *io = bio->bi_private;
	struct ppl_log *log = io->log;
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct stripe_head *sh, *next;

	pr_debug("%s: seq: %llu\n", __func__, io->seq);

	if (bio->bi_error)
		md_error(ppl_conf->mddev, log->rdev);

	list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
		list_del_init(&sh->log_list);

		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
}

static void ppl_submit_iounit_bio(struct ppl_io_unit *io, struct bio *bio)
{
	char b[BDEVNAME_SIZE];

	pr_debug("%s: seq: %llu size: %u sector: %llu dev: %s\n",
		 __func__, io->seq, bio->bi_iter.bi_size,
		 (unsigned long long)bio->bi_iter.bi_sector,
		 bdevname(bio->bi_bdev, b));

	submit_bio(bio);
}

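/*
 * Finalize the header of an io_unit: convert each entry's data_sector from
 * 512-byte sectors to the configured block size, store the entry checksums
 * bit-inverted, compute the header checksum, then submit the header page and
 * the partial parity pages as a FUA write to the PPL area of the log device.
 * If the inline bio runs out of bvecs, additional chained bios are allocated
 * from ppl_conf->bs.
 */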
static void ppl_submit_iounit(struct ppl_io_unit *io)
{
	struct ppl_log *log = io->log;
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct ppl_header *pplhdr = page_address(io->header_page);
	struct bio *bio = &io->bio;
	struct stripe_head *sh;
	int i;

	bio->bi_private = io;

	if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) {
		ppl_log_endio(bio);
		return;
	}

	for (i = 0; i < io->entries_count; i++) {
		struct ppl_header_entry *e = &pplhdr->entries[i];

		pr_debug("%s: seq: %llu entry: %d data_sector: %llu pp_size: %u data_size: %u\n",
			 __func__, io->seq, i, le64_to_cpu(e->data_sector),
			 le32_to_cpu(e->pp_size), le32_to_cpu(e->data_size));

		e->data_sector = cpu_to_le64(le64_to_cpu(e->data_sector) >>
					     ilog2(ppl_conf->block_size >> 9));
		e->checksum = cpu_to_le32(~le32_to_cpu(e->checksum));
	}

	pplhdr->entries_count = cpu_to_le32(io->entries_count);
	pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE));

	bio->bi_end_io = ppl_log_endio;
	bio->bi_opf = REQ_OP_WRITE | REQ_FUA;
	bio->bi_bdev = log->rdev->bdev;
	bio->bi_iter.bi_sector = log->rdev->ppl.sector;
	bio_add_page(bio, io->header_page, PAGE_SIZE, 0);

	list_for_each_entry(sh, &io->stripe_list, log_list) {
		/* entries for full stripe writes have no partial parity */
		if (test_bit(STRIPE_FULL_WRITE, &sh->state))
			continue;

		if (!bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0)) {
			struct bio *prev = bio;

			bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES,
					       ppl_conf->bs);
			bio->bi_opf = prev->bi_opf;
			bio->bi_bdev = prev->bi_bdev;
			bio->bi_iter.bi_sector = bio_end_sector(prev);
			bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0);

			bio_chain(bio, prev);
			ppl_submit_iounit_bio(io, prev);
		}
	}

	ppl_submit_iounit_bio(io, bio);
}

static void ppl_submit_current_io(struct ppl_log *log)
{
	struct ppl_io_unit *io;

	spin_lock_irq(&log->io_list_lock);

	io = list_first_entry_or_null(&log->io_list, struct ppl_io_unit,
				      log_sibling);
	if (io && io->submitted)
		io = NULL;

	spin_unlock_irq(&log->io_list_lock);

	if (io) {
		io->submitted = true;

		if (io == log->current_io)
			log->current_io = NULL;

		ppl_submit_iounit(io);
	}
}

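/*
 * For every child log, submit the oldest io_unit that has not been submitted
 * yet (the head of its io_list).
 */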
void ppl_write_stripe_run(struct r5conf *conf)
{
	struct ppl_conf *ppl_conf = conf->log_private;
	struct ppl_log *log;
	int i;

	for (i = 0; i < ppl_conf->count; i++) {
		log = &ppl_conf->child_logs[i];

		mutex_lock(&log->io_mutex);
		ppl_submit_current_io(log);
		mutex_unlock(&log->io_mutex);
	}
}

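/*
 * All stripes of an io_unit have been written to the array: remove the
 * io_unit from its log, return it to the mempool and retry one stripe that
 * previously failed to get an io_unit, if any are waiting on no_mem_stripes.
 */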
static void ppl_io_unit_finished(struct ppl_io_unit *io)
{
	struct ppl_log *log = io->log;
	struct ppl_conf *ppl_conf = log->ppl_conf;
	unsigned long flags;

	pr_debug("%s: seq: %llu\n", __func__, io->seq);

	local_irq_save(flags);

	spin_lock(&log->io_list_lock);
	list_del(&io->log_sibling);
	spin_unlock(&log->io_list_lock);

	mempool_free(io, ppl_conf->io_pool);

	spin_lock(&ppl_conf->no_mem_stripes_lock);
	if (!list_empty(&ppl_conf->no_mem_stripes)) {
		struct stripe_head *sh;

		sh = list_first_entry(&ppl_conf->no_mem_stripes,
				      struct stripe_head, log_list);
		list_del_init(&sh->log_list);
		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
	spin_unlock(&ppl_conf->no_mem_stripes_lock);

	local_irq_restore(flags);
}

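/*
 * Drop a written-out stripe's reference on its io_unit; the last stripe to
 * finish releases the io_unit.
 */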
void ppl_stripe_write_finished(struct stripe_head *sh)
{
	struct ppl_io_unit *io;

	io = sh->ppl_io;
	sh->ppl_io = NULL;

	if (io && atomic_dec_and_test(&io->pending_stripes))
		ppl_io_unit_finished(io);
}

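/* synchronously compute page1 ^= page2 using the async_tx API */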
static void ppl_xor(int size, struct page *page1, struct page *page2)
{
	struct async_submit_ctl submit;
	struct dma_async_tx_descriptor *tx;
	struct page *xor_srcs[] = { page1, page2 };

	init_async_submit(&submit, ASYNC_TX_ACK|ASYNC_TX_XOR_DROP_DST,
			  NULL, NULL, NULL, NULL);
	tx = async_xor(page1, xor_srcs, 0, 2, size, &submit);

	async_tx_quiesce(&tx);
}

/*
 * PPL recovery strategy: xor partial parity and data from all modified data
 * disks within a stripe and write the result as the new stripe parity. If all
 * stripe data disks are modified (full stripe write), no partial parity is
 * available, so just xor the data disks.
 *
 * Recovery of a PPL entry shall occur only if all modified data disks are
 * available and read from all of them succeeds.
 *
 * A PPL entry applies to a stripe; partial parity size for an entry is at most
 * the size of the chunk. Examples of possible cases for a single entry:
 *
 * case 0: single data disk write:
 *   data0    data1    data2     ppl        parity
 * +--------+--------+--------+           +--------------------+
 * | ------ | ------ | ------ | +----+    | (no change)        |
 * | ------ | -data- | ------ | | pp | -> | data1 ^ pp         |
 * | ------ | -data- | ------ | | pp | -> | data1 ^ pp         |
 * | ------ | ------ | ------ | +----+    | (no change)        |
 * +--------+--------+--------+           +--------------------+
 * pp_size = data_size
 *
 * case 1: more than one data disk write:
 *   data0    data1    data2     ppl        parity
 * +--------+--------+--------+           +--------------------+
 * | ------ | ------ | ------ | +----+    | (no change)        |
 * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp |
 * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp |
 * | ------ | ------ | ------ | +----+    | (no change)        |
 * +--------+--------+--------+           +--------------------+
 * pp_size = data_size / modified_data_disks
 *
 * case 2: write to all data disks (also full stripe write):
 *   data0    data1    data2                parity
 * +--------+--------+--------+           +--------------------+
 * | ------ | ------ | ------ |           | (no change)        |
 * | -data- | -data- | -data- | --------> | xor all data       |
 * | ------ | ------ | ------ | --------> | (no change)        |
 * | ------ | ------ | ------ |           | (no change)        |
 * +--------+--------+--------+           +--------------------+
 * pp_size = 0
 *
 * The following cases are possible only in other implementations. The recovery
 * code can handle them, but they are not generated at runtime because they can
 * be reduced to cases 0, 1 and 2:
 *
 * case 3:
 *   data0    data1    data2     ppl        parity
 * +--------+--------+--------+ +----+    +--------------------+
 * | ------ | -data- | -data- | | pp |    | data1 ^ data2 ^ pp |
 * | ------ | -data- | -data- | | pp | -> | data1 ^ data2 ^ pp |
 * | -data- | -data- | -data- | | -- | -> | xor all data       |
 * | -data- | -data- | ------ | | pp |    | data0 ^ data1 ^ pp |
 * +--------+--------+--------+ +----+    +--------------------+
 * pp_size = chunk_size
 *
 * case 4:
 *   data0    data1    data2     ppl        parity
 * +--------+--------+--------+ +----+    +--------------------+
 * | ------ | -data- | ------ | | pp |    | data1 ^ pp         |
 * | ------ | ------ | ------ | | -- | -> | (no change)        |
 * | ------ | ------ | ------ | | -- | -> | (no change)        |
 * | -data- | ------ | ------ | | pp |    | data0 ^ pp         |
 * +--------+--------+--------+ +----+    +--------------------+
 * pp_size = chunk_size
 */
static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e,
			     sector_t ppl_sector)
{
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct mddev *mddev = ppl_conf->mddev;
	struct r5conf *conf = mddev->private;
	int block_size = ppl_conf->block_size;
	struct page *page1;
	struct page *page2;
	sector_t r_sector_first;
	sector_t r_sector_last;
	int strip_sectors;
	int data_disks;
	int i;
	int ret = 0;
	char b[BDEVNAME_SIZE];
	unsigned int pp_size = le32_to_cpu(e->pp_size);
	unsigned int data_size = le32_to_cpu(e->data_size);

	page1 = alloc_page(GFP_KERNEL);
	page2 = alloc_page(GFP_KERNEL);

	if (!page1 || !page2) {
		ret = -ENOMEM;
		goto out;
	}

	r_sector_first = le64_to_cpu(e->data_sector) * (block_size >> 9);

	if ((pp_size >> 9) < conf->chunk_sectors) {
		if (pp_size > 0) {
			data_disks = data_size / pp_size;
			strip_sectors = pp_size >> 9;
		} else {
			data_disks = conf->raid_disks - conf->max_degraded;
			strip_sectors = (data_size >> 9) / data_disks;
		}
		r_sector_last = r_sector_first +
				(data_disks - 1) * conf->chunk_sectors +
				strip_sectors;
	} else {
		data_disks = conf->raid_disks - conf->max_degraded;
		strip_sectors = conf->chunk_sectors;
		r_sector_last = r_sector_first + (data_size >> 9);
	}

	pr_debug("%s: array sector first: %llu last: %llu\n", __func__,
		 (unsigned long long)r_sector_first,
		 (unsigned long long)r_sector_last);

	/* if start and end are 4k aligned, use a 4k block */
	if (block_size == 512 &&
	    (r_sector_first & (STRIPE_SECTORS - 1)) == 0 &&
	    (r_sector_last & (STRIPE_SECTORS - 1)) == 0)
		block_size = STRIPE_SIZE;

	/* iterate through blocks in strip */
	for (i = 0; i < strip_sectors; i += (block_size >> 9)) {
		bool update_parity = false;
		sector_t parity_sector;
		struct md_rdev *parity_rdev;
		struct stripe_head sh;
		int disk;
		int indent = 0;

		pr_debug("%s:%*s iter %d start\n", __func__, indent, "", i);
		indent += 2;

		memset(page_address(page1), 0, PAGE_SIZE);

		/* iterate through data member disks */
		for (disk = 0; disk < data_disks; disk++) {
			int dd_idx;
			struct md_rdev *rdev;
			sector_t sector;
			sector_t r_sector = r_sector_first + i +
					    (disk * conf->chunk_sectors);

			pr_debug("%s:%*s data member disk %d start\n",
				 __func__, indent, "", disk);
			indent += 2;

			if (r_sector >= r_sector_last) {
				pr_debug("%s:%*s array sector %llu doesn't need parity update\n",
					 __func__, indent, "",
					 (unsigned long long)r_sector);
				indent -= 2;
				continue;
			}

			update_parity = true;

			/* map raid sector to member disk */
			sector = raid5_compute_sector(conf, r_sector, 0,
						      &dd_idx, NULL);
			pr_debug("%s:%*s processing array sector %llu => data member disk %d, sector %llu\n",
				 __func__, indent, "",
				 (unsigned long long)r_sector, dd_idx,
				 (unsigned long long)sector);

			rdev = conf->disks[dd_idx].rdev;
			if (!rdev) {
				pr_debug("%s:%*s data member disk %d missing\n",
					 __func__, indent, "", dd_idx);
				update_parity = false;
				break;
			}

			pr_debug("%s:%*s reading data member disk %s sector %llu\n",
				 __func__, indent, "", bdevname(rdev->bdev, b),
				 (unsigned long long)sector);
			if (!sync_page_io(rdev, sector, block_size, page2,
					  REQ_OP_READ, 0, false)) {
				md_error(mddev, rdev);
				pr_debug("%s:%*s read failed!\n", __func__,
					 indent, "");
				ret = -EIO;
				goto out;
			}

			ppl_xor(block_size, page1, page2);

			indent -= 2;
		}

		if (!update_parity)
			continue;

		if (pp_size > 0) {
			pr_debug("%s:%*s reading pp disk sector %llu\n",
				 __func__, indent, "",
				 (unsigned long long)(ppl_sector + i));
			if (!sync_page_io(log->rdev,
					ppl_sector - log->rdev->data_offset + i,
					block_size, page2, REQ_OP_READ, 0,
					false)) {
				pr_debug("%s:%*s read failed!\n", __func__,
					 indent, "");
				md_error(mddev, log->rdev);
				ret = -EIO;
				goto out;
			}

			ppl_xor(block_size, page1, page2);
		}

		/* map raid sector to parity disk */
		parity_sector = raid5_compute_sector(conf, r_sector_first + i,
				0, &disk, &sh);
		BUG_ON(sh.pd_idx != le32_to_cpu(e->parity_disk));
		parity_rdev = conf->disks[sh.pd_idx].rdev;

		BUG_ON(parity_rdev->bdev->bd_dev != log->rdev->bdev->bd_dev);
		pr_debug("%s:%*s write parity at sector %llu, disk %s\n",
			 __func__, indent, "",
			 (unsigned long long)parity_sector,
			 bdevname(parity_rdev->bdev, b));
		if (!sync_page_io(parity_rdev, parity_sector, block_size,
				  page1, REQ_OP_WRITE, 0, false)) {
			pr_debug("%s:%*s parity write error!\n", __func__,
				 indent, "");
			md_error(mddev, parity_rdev);
			ret = -EIO;
			goto out;
		}
	}
out:
	if (page1)
		__free_page(page1);
	if (page2)
		__free_page(page2);

	return ret;
}

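/*
 * Walk the entries of a valid PPL header read from the log area: verify each
 * entry's partial parity checksum and, if it matches, recalculate and rewrite
 * the parity of the affected stripes. Entries with a checksum mismatch are
 * counted and skipped. The disk cache is flushed at the end.
 */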
static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr)
{
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct md_rdev *rdev = log->rdev;
	struct mddev *mddev = rdev->mddev;
	sector_t ppl_sector = rdev->ppl.sector + (PPL_HEADER_SIZE >> 9);
	struct page *page;
	int i;
	int ret = 0;

	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	/* iterate through all PPL entries saved */
	for (i = 0; i < le32_to_cpu(pplhdr->entries_count); i++) {
		struct ppl_header_entry *e = &pplhdr->entries[i];
		u32 pp_size = le32_to_cpu(e->pp_size);
		sector_t sector = ppl_sector;
		int ppl_entry_sectors = pp_size >> 9;
		u32 crc, crc_stored;

		pr_debug("%s: disk: %d entry: %d ppl_sector: %llu pp_size: %u\n",
			 __func__, rdev->raid_disk, i,
			 (unsigned long long)ppl_sector, pp_size);

		crc = ~0;
		crc_stored = le32_to_cpu(e->checksum);

		/* read partial parity for this entry and calculate its checksum */
		while (pp_size) {
			int s = pp_size > PAGE_SIZE ? PAGE_SIZE : pp_size;

			if (!sync_page_io(rdev, sector - rdev->data_offset,
					  s, page, REQ_OP_READ, 0, false)) {
				md_error(mddev, rdev);
				ret = -EIO;
				goto out;
			}

			crc = crc32c_le(crc, page_address(page), s);

			pp_size -= s;
			sector += s >> 9;
		}

		crc = ~crc;

		if (crc != crc_stored) {
			/*
			 * Don't recover this entry if the checksum does not
			 * match, but keep going and try to recover other
			 * entries.
			 */
			pr_debug("%s: ppl entry crc does not match: stored: 0x%x calculated: 0x%x\n",
				 __func__, crc_stored, crc);
			ppl_conf->mismatch_count++;
		} else {
			ret = ppl_recover_entry(log, e, ppl_sector);
			if (ret)
				goto out;
			ppl_conf->recovered_entries++;
		}

		ppl_sector += ppl_entry_sectors;
	}

	/* flush the disk cache after recovery if necessary */
	ret = blkdev_issue_flush(rdev->bdev, GFP_KERNEL, NULL);
out:
	__free_page(page);
	return ret;
}

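/*
 * Write a PPL header with no entries to the log area of the member disk,
 * invalidating anything previously logged there.
 */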
static int ppl_write_empty_header(struct ppl_log *log)
{
	struct page *page;
	struct ppl_header *pplhdr;
	struct md_rdev *rdev = log->rdev;
	int ret = 0;

	pr_debug("%s: disk: %d ppl_sector: %llu\n", __func__,
		 rdev->raid_disk, (unsigned long long)rdev->ppl.sector);

	page = alloc_page(GFP_NOIO | __GFP_ZERO);
	if (!page)
		return -ENOMEM;

	pplhdr = page_address(page);
	memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
	pplhdr->signature = cpu_to_le32(log->ppl_conf->signature);
	pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PAGE_SIZE));

	if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset,
			  PPL_HEADER_SIZE, page, REQ_OP_WRITE | REQ_SYNC |
			  REQ_FUA, 0, false)) {
		md_error(rdev->mddev, rdev);
		ret = -EIO;
	}

	__free_page(page);
	return ret;
}

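/*
 * Read and validate the PPL header of a single member disk (checksum and
 * signature), replay the log if we are starting a dirty array, and write back
 * an empty header when starting the array.
 */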
static int ppl_load_distributed(struct ppl_log *log)
{
	struct ppl_conf *ppl_conf = log->ppl_conf;
	struct md_rdev *rdev = log->rdev;
	struct mddev *mddev = rdev->mddev;
	struct page *page;
	struct ppl_header *pplhdr;
	u32 crc, crc_stored;
	u32 signature;
	int ret = 0;

	pr_debug("%s: disk: %d\n", __func__, rdev->raid_disk);

	/* read PPL header */
	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset,
			  PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
		md_error(mddev, rdev);
		ret = -EIO;
		goto out;
	}
	pplhdr = page_address(page);

	/* check header validity */
	crc_stored = le32_to_cpu(pplhdr->checksum);
	pplhdr->checksum = 0;
	crc = ~crc32c_le(~0, pplhdr, PAGE_SIZE);

	if (crc_stored != crc) {
		pr_debug("%s: ppl header crc does not match: stored: 0x%x calculated: 0x%x\n",
			 __func__, crc_stored, crc);
		ppl_conf->mismatch_count++;
		goto out;
	}

	signature = le32_to_cpu(pplhdr->signature);

	if (mddev->external) {
		/*
		 * For external metadata the header signature is set and
		 * validated in userspace.
		 */
		ppl_conf->signature = signature;
	} else if (ppl_conf->signature != signature) {
		pr_debug("%s: ppl header signature does not match: stored: 0x%x configured: 0x%x\n",
			 __func__, signature, ppl_conf->signature);
		ppl_conf->mismatch_count++;
		goto out;
	}

	/* attempt to recover from log if we are starting a dirty array */
	if (!mddev->pers && mddev->recovery_cp != MaxSector)
		ret = ppl_recover(log, pplhdr);
out:
	/* write empty header if we are starting the array */
	if (!ret && !mddev->pers)
		ret = ppl_write_empty_header(log);

	__free_page(page);

	pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n",
		 __func__, ret, ppl_conf->mismatch_count,
		 ppl_conf->recovered_entries);

	return ret;
}

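/*
 * Load the PPL of every member disk. For external metadata also verify that
 * all member disks carry the same header signature.
 */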
static int ppl_load(struct ppl_conf *ppl_conf)
{
	int ret = 0;
	u32 signature = 0;
	bool signature_set = false;
	int i;

	for (i = 0; i < ppl_conf->count; i++) {
		struct ppl_log *log = &ppl_conf->child_logs[i];

		/* skip missing drive */
		if (!log->rdev)
			continue;

		ret = ppl_load_distributed(log);
		if (ret)
			break;

		/*
		 * For external metadata we can't check if the signature is
		 * correct on a single drive, but we can check if it is the same
		 * on all drives.
		 */
		if (ppl_conf->mddev->external) {
			if (!signature_set) {
				signature = ppl_conf->signature;
				signature_set = true;
			} else if (signature != ppl_conf->signature) {
				pr_warn("md/raid:%s: PPL header signature does not match on all member drives\n",
					mdname(ppl_conf->mddev));
				ret = -EINVAL;
				break;
			}
		}
	}

	pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n",
		 __func__, ret, ppl_conf->mismatch_count,
		 ppl_conf->recovered_entries);

	return ret;
}

static void __ppl_exit_log(struct ppl_conf *ppl_conf)
{
	clear_bit(MD_HAS_PPL, &ppl_conf->mddev->flags);

	kfree(ppl_conf->child_logs);

	if (ppl_conf->bs)
		bioset_free(ppl_conf->bs);
	mempool_destroy(ppl_conf->io_pool);
	kmem_cache_destroy(ppl_conf->io_kc);

	kfree(ppl_conf);
}

void ppl_exit_log(struct r5conf *conf)
{
	struct ppl_conf *ppl_conf = conf->log_private;

	if (ppl_conf) {
		__ppl_exit_log(ppl_conf);
		conf->log_private = NULL;
	}
}

static int ppl_validate_rdev(struct md_rdev *rdev)
{
	char b[BDEVNAME_SIZE];
	int ppl_data_sectors;
	int ppl_size_new;

	/*
	 * The configured PPL size must be enough to store
	 * the header and (at the very least) partial parity
	 * for one stripe. Round it down to ensure the data
	 * space is cleanly divisible by stripe size.
	 */
	ppl_data_sectors = rdev->ppl.size - (PPL_HEADER_SIZE >> 9);

	if (ppl_data_sectors > 0)
		ppl_data_sectors = rounddown(ppl_data_sectors, STRIPE_SECTORS);

	if (ppl_data_sectors <= 0) {
		pr_warn("md/raid:%s: PPL space too small on %s\n",
			mdname(rdev->mddev), bdevname(rdev->bdev, b));
		return -ENOSPC;
	}

	ppl_size_new = ppl_data_sectors + (PPL_HEADER_SIZE >> 9);

	if ((rdev->ppl.sector < rdev->data_offset &&
	     rdev->ppl.sector + ppl_size_new > rdev->data_offset) ||
	    (rdev->ppl.sector >= rdev->data_offset &&
	     rdev->data_offset + rdev->sectors > rdev->ppl.sector)) {
		pr_warn("md/raid:%s: PPL space overlaps with data on %s\n",
			mdname(rdev->mddev), bdevname(rdev->bdev, b));
		return -EINVAL;
	}

	if (!rdev->mddev->external &&
	    ((rdev->ppl.offset > 0 && rdev->ppl.offset < (rdev->sb_size >> 9)) ||
	     (rdev->ppl.offset <= 0 && rdev->ppl.offset + ppl_size_new > 0))) {
		pr_warn("md/raid:%s: PPL space overlaps with superblock on %s\n",
			mdname(rdev->mddev), bdevname(rdev->bdev, b));
		return -EINVAL;
	}

	rdev->ppl.size = ppl_size_new;

	return 0;
}

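/*
 * Set up PPL for an array: check compatibility (RAID5 only, no bitmap, no
 * journal, 4k pages), allocate the per-disk child logs and supporting pools,
 * then load and possibly recover the existing logs from the member disks.
 */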
int ppl_init_log(struct r5conf *conf)
{
	struct ppl_conf *ppl_conf;
	struct mddev *mddev = conf->mddev;
	int ret = 0;
	int i;
	bool need_cache_flush = false;

	pr_debug("md/raid:%s: enabling distributed Partial Parity Log\n",
		 mdname(conf->mddev));

	if (PAGE_SIZE != 4096)
		return -EINVAL;

	if (mddev->level != 5) {
		pr_warn("md/raid:%s PPL is not compatible with raid level %d\n",
			mdname(mddev), mddev->level);
		return -EINVAL;
	}

	if (mddev->bitmap_info.file || mddev->bitmap_info.offset) {
		pr_warn("md/raid:%s PPL is not compatible with bitmap\n",
			mdname(mddev));
		return -EINVAL;
	}

	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
		pr_warn("md/raid:%s PPL is not compatible with journal\n",
			mdname(mddev));
		return -EINVAL;
	}

	ppl_conf = kzalloc(sizeof(struct ppl_conf), GFP_KERNEL);
	if (!ppl_conf)
		return -ENOMEM;

	ppl_conf->mddev = mddev;

	ppl_conf->io_kc = KMEM_CACHE(ppl_io_unit, 0);
	if (!ppl_conf->io_kc) {
		ret = -ENOMEM;
		goto err;
	}

	ppl_conf->io_pool = mempool_create(conf->raid_disks, ppl_io_pool_alloc,
					   ppl_io_pool_free, ppl_conf->io_kc);
	if (!ppl_conf->io_pool) {
		ret = -ENOMEM;
		goto err;
	}

	ppl_conf->bs = bioset_create(conf->raid_disks, 0);
	if (!ppl_conf->bs) {
		ret = -ENOMEM;
		goto err;
	}

	ppl_conf->count = conf->raid_disks;
	ppl_conf->child_logs = kcalloc(ppl_conf->count, sizeof(struct ppl_log),
				       GFP_KERNEL);
	if (!ppl_conf->child_logs) {
		ret = -ENOMEM;
		goto err;
	}

	atomic64_set(&ppl_conf->seq, 0);
	INIT_LIST_HEAD(&ppl_conf->no_mem_stripes);
	spin_lock_init(&ppl_conf->no_mem_stripes_lock);

	if (!mddev->external) {
		ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid));
		ppl_conf->block_size = 512;
	} else {
		ppl_conf->block_size = queue_logical_block_size(mddev->queue);
	}

	for (i = 0; i < ppl_conf->count; i++) {
		struct ppl_log *log = &ppl_conf->child_logs[i];
		struct md_rdev *rdev = conf->disks[i].rdev;

		mutex_init(&log->io_mutex);
		spin_lock_init(&log->io_list_lock);
		INIT_LIST_HEAD(&log->io_list);

		log->ppl_conf = ppl_conf;
		log->rdev = rdev;

		if (rdev) {
			struct request_queue *q;

			ret = ppl_validate_rdev(rdev);
			if (ret)
				goto err;

			q = bdev_get_queue(rdev->bdev);
			if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
				need_cache_flush = true;
		}
	}

	if (need_cache_flush)
		pr_warn("md/raid:%s: Volatile write-back cache should be disabled on all member drives when using PPL!\n",
			mdname(mddev));

	/* load and possibly recover the logs from the member disks */
	ret = ppl_load(ppl_conf);

	if (ret) {
		goto err;
	} else if (!mddev->pers &&
		   mddev->recovery_cp == 0 && !mddev->degraded &&
		   ppl_conf->recovered_entries > 0 &&
		   ppl_conf->mismatch_count == 0) {
		/*
		 * If we are starting a dirty array and the recovery succeeds
		 * without any issues, set the array as clean.
		 */
		mddev->recovery_cp = MaxSector;
		set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
	} else if (mddev->pers && ppl_conf->mismatch_count > 0) {
		/* no mismatch allowed when enabling PPL for a running array */
		ret = -EINVAL;
		goto err;
	}

	conf->log_private = ppl_conf;
	set_bit(MD_HAS_PPL, &ppl_conf->mddev->flags);

	return 0;
err:
	__ppl_exit_log(ppl_conf);
	return ret;
}

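/*
 * Update a child log when a member disk is added to or removed from the
 * array. Adding validates the PPL space on the disk and writes an empty
 * header before the log is used.
 */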
int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add)
{
	struct ppl_conf *ppl_conf = conf->log_private;
	struct ppl_log *log;
	int ret = 0;
	char b[BDEVNAME_SIZE];

	if (!rdev)
		return -EINVAL;

	pr_debug("%s: disk: %d operation: %s dev: %s\n",
		 __func__, rdev->raid_disk, add ? "add" : "remove",
		 bdevname(rdev->bdev, b));

	if (rdev->raid_disk < 0)
		return 0;

	if (rdev->raid_disk >= ppl_conf->count)
		return -ENODEV;

	log = &ppl_conf->child_logs[rdev->raid_disk];

	mutex_lock(&log->io_mutex);
	if (add) {
		ret = ppl_validate_rdev(rdev);
		if (!ret) {
			log->rdev = rdev;
			ret = ppl_write_empty_header(log);
		}
	} else {
		log->rdev = NULL;
	}
	mutex_unlock(&log->io_mutex);

	return ret;
}