raid5-cache.c

/*
 * Copyright (C) 2015 Shaohua Li <shli@fb.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 */
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/raid/md_p.h>
#include <linux/crc32c.h>
#include <linux/random.h>
#include "md.h"
#include "raid5.h"

/*
 * Metadata/data are stored on disk in 4k units (blocks) regardless of the
 * underlying hardware sector size. This only works with PAGE_SIZE == 4096.
 */
#define BLOCK_SECTORS (8)

/*
 * Reclaim runs when reclaimable space reaches 1/4 of the log device size or
 * 10G, whichever is smaller. This prevents recovery from having to scan a
 * very long log.
 */
#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
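
/*
 * For scale: RECLAIM_MAX_FREE_SPACE is in 512-byte sectors, so
 * 10 * 1024 * 1024 * 2 sectors * 512 bytes = 10GiB. The SHIFT value of 2
 * is applied as device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT, i.e. one
 * quarter of the log device (see r5l_load_log()).
 */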

struct r5l_log {
	struct md_rdev *rdev;

	u32 uuid_checksum;

	sector_t device_size;		/* log device size, round to
					 * BLOCK_SECTORS */

	sector_t max_free_space;	/* reclaim run if free space is at
					 * this size */

	sector_t last_checkpoint;	/* log tail. where recovery scan
					 * starts from */
	u64 last_cp_seq;		/* log tail sequence */

	sector_t log_start;		/* log head. where new data appends */
	u64 seq;			/* log head sequence */

	struct mutex io_mutex;
	struct r5l_io_unit *current_io;	/* current io_unit accepting new data */

	spinlock_t io_list_lock;
	struct list_head running_ios;	/* io_units which are still running,
					 * and have not yet been completely
					 * written to the log */
	struct list_head io_end_ios;	/* io_units which have been completely
					 * written to the log but not yet written
					 * to the RAID */
	struct list_head stripe_end_ios;/* io_units which have been completely
					 * written to the RAID but have not yet
					 * been considered for updating super */

	struct kmem_cache *io_kc;

	struct md_thread *reclaim_thread;
	unsigned long reclaim_target;	/* amount of space that needs to be
					 * reclaimed. If it's 0, reclaim the
					 * space used by io_units which are in
					 * IO_UNIT_STRIPE_END state (i.e.
					 * reclaim doesn't wait for a specific
					 * io_unit to switch to
					 * IO_UNIT_STRIPE_END state) */

	struct list_head no_space_stripes; /* pending stripes, log has no space */
	spinlock_t no_space_stripes_lock;
};
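
/*
 * The log device is used as a ring of BLOCK_SECTORS-sized blocks. Roughly
 * (illustrative layout, ignoring wrap-around, not to scale):
 *
 *	|<------------- device_size ------------->|
 *	+--------+====================+-----------+
 *	         ^                    ^
 *	   last_checkpoint        log_start
 *	   (tail, where recovery  (head, where new
 *	    starts scanning)       data is appended)
 *
 * The "====" region between tail and head holds live metadata/data; the
 * rest is free. r5l_ring_add()/r5l_ring_distance() below do the
 * wrap-around arithmetic for this ring.
 */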

/*
 * An IO range starts at a metadata block and ends at the next metadata
 * block. The io_unit's metadata block tracks the data/parity that follows
 * it. An io_unit is written to the log disk with normal writes; since we
 * always flush the log disk before moving data to the raid disks, there is
 * no need to write the io_unit with FLUSH/FUA.
 */
struct r5l_io_unit {
	struct r5l_log *log;

	struct page *meta_page;	/* store meta block */
	int meta_offset;	/* current offset in meta_page */

	struct bio_list bios;
	atomic_t pending_io;	/* pending bios not written to log yet */
	struct bio *current_bio;/* current_bio accepting new data */

	atomic_t pending_stripe;/* how many stripes not flushed to raid */
	u64 seq;		/* seq number of the metablock */
	sector_t log_start;	/* where the io_unit starts */
	sector_t log_end;	/* where the io_unit ends */
	struct list_head log_sibling; /* log->running_ios */
	struct list_head stripe_list; /* stripes added to the io_unit */

	int state;
	wait_queue_head_t wait_state;
};

/* r5l_io_unit state */
enum r5l_io_unit_state {
	IO_UNIT_RUNNING = 0,	/* accepting new IO */
	IO_UNIT_IO_START = 1,	/* io_unit bio started writing to log,
				 * doesn't accept new bio */
	IO_UNIT_IO_END = 2,	/* io_unit bio finished writing to log */
	IO_UNIT_STRIPE_START = 3, /* stripes of io_unit are flushing to raid */
	IO_UNIT_STRIPE_END = 4,	/* stripes data finished writing to raid */
};
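
/*
 * The io_unit state only moves forward:
 *
 *	RUNNING -> IO_START -> IO_END -> STRIPE_START -> STRIPE_END
 *
 * IO_START is set when the io_unit is submitted (r5l_submit_current_io),
 * IO_END by the log bio completion (r5l_log_endio), STRIPE_START by
 * r5l_flush_stripe_to_raid(), and STRIPE_END once every stripe of the
 * io_unit has finished its raid write (r5l_stripe_write_finished).
 * __r5l_set_io_unit_state() warns if a transition would not go forward.
 */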

static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
{
	start += inc;
	if (start >= log->device_size)
		start = start - log->device_size;
	return start;
}

static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
				  sector_t end)
{
	if (end >= start)
		return end - start;
	else
		return end + log->device_size - start;
}
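
/*
 * Example of the wrap-around arithmetic, with a hypothetical device_size
 * of 1000 sectors:
 *
 *	r5l_ring_add(log, 996, 8)        ->   4  (wraps past the end)
 *	r5l_ring_distance(log, 996, 4)   ->   8  (end < start, so
 *	                                          4 + 1000 - 996)
 *	r5l_ring_distance(log, 100, 900) -> 800
 *
 * Note r5l_ring_add() assumes inc is at most device_size, so a single
 * subtraction is enough after the wrap.
 */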

static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
{
	sector_t used_size;

	used_size = r5l_ring_distance(log, log->last_checkpoint,
				      log->log_start);

	return log->device_size > used_size + size;
}

static struct r5l_io_unit *r5l_alloc_io_unit(struct r5l_log *log)
{
	struct r5l_io_unit *io;
	/* We can't handle memory allocate failure so far */
	gfp_t gfp = GFP_NOIO | __GFP_NOFAIL;

	io = kmem_cache_zalloc(log->io_kc, gfp);
	io->log = log;
	io->meta_page = alloc_page(gfp | __GFP_ZERO);

	bio_list_init(&io->bios);
	INIT_LIST_HEAD(&io->log_sibling);
	INIT_LIST_HEAD(&io->stripe_list);
	io->state = IO_UNIT_RUNNING;
	init_waitqueue_head(&io->wait_state);
	return io;
}

static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
{
	__free_page(io->meta_page);
	kmem_cache_free(log->io_kc, io);
}

static void r5l_move_io_unit_list(struct list_head *from, struct list_head *to,
				  enum r5l_io_unit_state state)
{
	struct r5l_io_unit *io;

	while (!list_empty(from)) {
		io = list_first_entry(from, struct r5l_io_unit, log_sibling);
		/* don't change list order */
		if (io->state >= state)
			list_move_tail(&io->log_sibling, to);
		else
			break;
	}
}

/*
 * We don't want too many io_units to reside in the stripe_end_ios list,
 * which would waste a lot of memory, so we try to remove some. But we must
 * keep at least two io_units: the superblock must point to a valid meta,
 * and keeping the last meta lets recovery scan less.
 */
static void r5l_compress_stripe_end_list(struct r5l_log *log)
{
	struct r5l_io_unit *first, *last, *io;

	first = list_first_entry(&log->stripe_end_ios,
				 struct r5l_io_unit, log_sibling);
	last = list_last_entry(&log->stripe_end_ios,
			       struct r5l_io_unit, log_sibling);
	if (first == last)
		return;
	list_del(&first->log_sibling);
	list_del(&last->log_sibling);
	while (!list_empty(&log->stripe_end_ios)) {
		io = list_first_entry(&log->stripe_end_ios,
				      struct r5l_io_unit, log_sibling);
		list_del(&io->log_sibling);
		first->log_end = io->log_end;
		r5l_free_io_unit(log, io);
	}
	list_add_tail(&first->log_sibling, &log->stripe_end_ios);
	list_add_tail(&last->log_sibling, &log->stripe_end_ios);
}
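
/*
 * For example, if stripe_end_ios currently holds io_units A, B, C, D, E
 * (oldest first), the loop above frees B, C and D, folds their log range
 * into A (A->log_end becomes D's log_end) and leaves just A and E on the
 * list. Only the covered log range and the list linkage matter here; the
 * freed io_units have already completed all of their work.
 */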

static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
				    enum r5l_io_unit_state state)
{
	struct r5l_log *log = io->log;

	if (WARN_ON(io->state >= state))
		return;
	io->state = state;
	if (state == IO_UNIT_IO_END)
		r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios,
				      IO_UNIT_IO_END);
	if (state == IO_UNIT_STRIPE_END) {
		struct r5l_io_unit *last;
		sector_t reclaimable_space;

		r5l_move_io_unit_list(&log->io_end_ios, &log->stripe_end_ios,
				      IO_UNIT_STRIPE_END);
		last = list_last_entry(&log->stripe_end_ios,
				       struct r5l_io_unit, log_sibling);
		reclaimable_space = r5l_ring_distance(log, log->last_checkpoint,
						      last->log_end);
		if (reclaimable_space >= log->max_free_space)
			r5l_wake_reclaim(log, 0);

		r5l_compress_stripe_end_list(log);
	}
	wake_up(&io->wait_state);
}

static void r5l_set_io_unit_state(struct r5l_io_unit *io,
				  enum r5l_io_unit_state state)
{
	struct r5l_log *log = io->log;
	unsigned long flags;

	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, state);
	spin_unlock_irqrestore(&log->io_list_lock, flags);
}

/* XXX: totally ignores I/O errors */
static void r5l_log_endio(struct bio *bio)
{
	struct r5l_io_unit *io = bio->bi_private;
	struct r5l_log *log = io->log;

	bio_put(bio);

	if (!atomic_dec_and_test(&io->pending_io))
		return;

	r5l_set_io_unit_state(io, IO_UNIT_IO_END);
	md_wakeup_thread(log->rdev->mddev->thread);
}

static void r5l_submit_current_io(struct r5l_log *log)
{
	struct r5l_io_unit *io = log->current_io;
	struct r5l_meta_block *block;
	struct bio *bio;
	u32 crc;

	if (!io)
		return;

	block = page_address(io->meta_page);
	block->meta_size = cpu_to_le32(io->meta_offset);
	crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
	block->checksum = cpu_to_le32(crc);

	log->current_io = NULL;
	r5l_set_io_unit_state(io, IO_UNIT_IO_START);

	while ((bio = bio_list_pop(&io->bios))) {
		/* all IO must start from rdev->data_offset */
		bio->bi_iter.bi_sector += log->rdev->data_offset;
		submit_bio(WRITE, bio);
	}
}

static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
{
	struct r5l_io_unit *io;
	struct r5l_meta_block *block;
	struct bio *bio;

	io = r5l_alloc_io_unit(log);

	block = page_address(io->meta_page);
	block->magic = cpu_to_le32(R5LOG_MAGIC);
	block->version = R5LOG_VERSION;
	block->seq = cpu_to_le64(log->seq);
	block->position = cpu_to_le64(log->log_start);

	io->log_start = log->log_start;
	io->meta_offset = sizeof(struct r5l_meta_block);
	io->seq = log->seq;

	bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
	io->current_bio = bio;
	bio->bi_rw = WRITE;
	bio->bi_bdev = log->rdev->bdev;
	bio->bi_iter.bi_sector = log->log_start;
	bio_add_page(bio, io->meta_page, PAGE_SIZE, 0);
	bio->bi_end_io = r5l_log_endio;
	bio->bi_private = io;

	bio_list_add(&io->bios, bio);
	atomic_inc(&io->pending_io);

	log->seq++;
	log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
	io->log_end = log->log_start;
	/* current bio hit disk end */
	if (log->log_start == 0)
		io->current_bio = NULL;

	spin_lock_irq(&log->io_list_lock);
	list_add_tail(&io->log_sibling, &log->running_ios);
	spin_unlock_irq(&log->io_list_lock);

	return io;
}
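
/*
 * On-disk layout produced by one io_unit (illustrative, for a stripe with
 * two data pages and one parity page):
 *
 *	log_start:	meta block (r5l_meta_block + payload descriptors)
 *	+1 block:	data page
 *	+2 blocks:	data page
 *	+3 blocks:	parity page
 *
 * Each block is BLOCK_SECTORS sectors (4KB). The payload descriptors inside
 * the meta block record the raid location and checksum of each following
 * page, which is what recovery later replays.
 */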

static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
{
	struct r5l_io_unit *io;

	io = log->current_io;
	if (io && io->meta_offset + payload_size > PAGE_SIZE)
		r5l_submit_current_io(log);
	io = log->current_io;
	if (io)
		return 0;

	log->current_io = r5l_new_meta(log);
	return 0;
}

static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
				    sector_t location,
				    u32 checksum1, u32 checksum2,
				    bool checksum2_valid)
{
	struct r5l_io_unit *io = log->current_io;
	struct r5l_payload_data_parity *payload;

	payload = page_address(io->meta_page) + io->meta_offset;
	payload->header.type = cpu_to_le16(type);
	payload->header.flags = cpu_to_le16(0);
	payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
				    (PAGE_SHIFT - 9));
	payload->location = cpu_to_le64(location);
	payload->checksum[0] = cpu_to_le32(checksum1);
	if (checksum2_valid)
		payload->checksum[1] = cpu_to_le32(checksum2);

	io->meta_offset += sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * (1 + !!checksum2_valid);
}
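
/*
 * payload->size is in sectors: (1 + !!checksum2_valid) << (PAGE_SHIFT - 9)
 * is 8 sectors (one 4KB page) for a data payload and 16 sectors (two pages,
 * P and Q) for a RAID6 parity payload. meta_offset advances by the
 * descriptor size plus one __le32 checksum per page, so with 4KB pages a
 * single meta block can describe roughly a couple hundred pages before
 * r5l_get_meta() has to start a new io_unit.
 */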

static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
{
	struct r5l_io_unit *io = log->current_io;

alloc_bio:
	if (!io->current_bio) {
		struct bio *bio;

		bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
		bio->bi_rw = WRITE;
		bio->bi_bdev = log->rdev->bdev;
		bio->bi_iter.bi_sector = log->log_start;
		bio->bi_end_io = r5l_log_endio;
		bio->bi_private = io;

		bio_list_add(&io->bios, bio);
		atomic_inc(&io->pending_io);
		io->current_bio = bio;
	}
	if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) {
		io->current_bio = NULL;
		goto alloc_bio;
	}
	log->log_start = r5l_ring_add(log, log->log_start,
				      BLOCK_SECTORS);
	/* current bio hit disk end */
	if (log->log_start == 0)
		io->current_bio = NULL;

	io->log_end = log->log_start;
}

static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
			   int data_pages, int parity_pages)
{
	int i;
	int meta_size;
	struct r5l_io_unit *io;

	meta_size =
		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
		 * data_pages) +
		sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * parity_pages;

	r5l_get_meta(log, meta_size);
	io = log->current_io;

	for (i = 0; i < sh->disks; i++) {
		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
			continue;
		if (i == sh->pd_idx || i == sh->qd_idx)
			continue;
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
					raid5_compute_blocknr(sh, i, 0),
					sh->dev[i].log_checksum, 0, false);
		r5l_append_payload_page(log, sh->dev[i].page);
	}

	if (sh->qd_idx >= 0) {
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
					sh->sector, sh->dev[sh->pd_idx].log_checksum,
					sh->dev[sh->qd_idx].log_checksum, true);
		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
		r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
	} else {
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
					sh->sector, sh->dev[sh->pd_idx].log_checksum,
					0, false);
		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
	}

	list_add_tail(&sh->log_list, &io->stripe_list);
	atomic_inc(&io->pending_stripe);
	sh->log_io = io;
}

/*
 * This runs in raid5d, and reclaim could wait for raid5d too (when it
 * flushes data from the log to the raid disks), so we shouldn't wait for
 * reclaim here.
 */
int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
{
	int write_disks = 0;
	int data_pages, parity_pages;
	int meta_size;
	int reserve;
	int i;

	if (!log)
		return -EAGAIN;
	/* Don't support stripe batch */
	if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
	    test_bit(STRIPE_SYNCING, &sh->state)) {
		/* the stripe is written to log, we start writing it to raid */
		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
		return -EAGAIN;
	}

	for (i = 0; i < sh->disks; i++) {
		void *addr;

		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
			continue;
		write_disks++;
		/* checksum is already calculated in last run */
		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
			continue;
		addr = kmap_atomic(sh->dev[i].page);
		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
						    addr, PAGE_SIZE);
		kunmap_atomic(addr);
	}
	parity_pages = 1 + !!(sh->qd_idx >= 0);
	data_pages = write_disks - parity_pages;

	meta_size =
		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
		 * data_pages) +
		sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * parity_pages;
	/* Doesn't work with very big raid arrays */
	if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE)
		return -EINVAL;

	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
	atomic_inc(&sh->count);
	mutex_lock(&log->io_mutex);
	/* meta + data */
	reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
	if (r5l_has_free_space(log, reserve))
		r5l_log_stripe(log, sh, data_pages, parity_pages);
	else {
		spin_lock(&log->no_space_stripes_lock);
		list_add_tail(&sh->log_list, &log->no_space_stripes);
		spin_unlock(&log->no_space_stripes_lock);

		r5l_wake_reclaim(log, reserve);
	}
	mutex_unlock(&log->io_mutex);
	return 0;
}
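
/*
 * A worked example of the space reservation above, assuming a hypothetical
 * 6-disk RAID5 doing a full-stripe write: write_disks = 6 (5 data +
 * 1 parity), so reserve = (1 + 6) << (PAGE_SHIFT - 9) = 7 * 8 = 56
 * sectors, i.e. one 4KB meta block plus six 4KB pages. The reservation is
 * deliberately pessimistic: r5l_log_stripe() may pack several stripes
 * behind one meta block, in which case less space is actually consumed.
 */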

void r5l_write_stripe_run(struct r5l_log *log)
{
	if (!log)
		return;
	mutex_lock(&log->io_mutex);
	r5l_submit_current_io(log);
	mutex_unlock(&log->io_mutex);
}

/* This will run after log space is reclaimed */
static void r5l_run_no_space_stripes(struct r5l_log *log)
{
	struct stripe_head *sh;

	spin_lock(&log->no_space_stripes_lock);
	while (!list_empty(&log->no_space_stripes)) {
		sh = list_first_entry(&log->no_space_stripes,
				      struct stripe_head, log_list);
		list_del_init(&sh->log_list);
		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
	spin_unlock(&log->no_space_stripes_lock);
}

void r5l_stripe_write_finished(struct stripe_head *sh)
{
	struct r5l_io_unit *io;

	/* Don't support stripe batch */
	io = sh->log_io;
	if (!io)
		return;
	sh->log_io = NULL;

	if (atomic_dec_and_test(&io->pending_stripe))
		r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
}

/*
 * Start dispatching IO to the raid disks.
 * The log is a sequence of io_units, each headed by a meta block. There is
 * one situation we want to avoid: a broken meta in the middle of the log
 * would make recovery unable to find the metas at the head of the log. So
 * if an operation requires a meta at the head to be persistent in the log,
 * we must make sure every meta before it is persistent in the log too. One
 * such case is:
 *
 * stripe data/parity is in the log and we start writing the stripe to the
 * raid disks. The stripe's data/parity must be persistent in the log before
 * we do the write to the raid disks.
 *
 * The solution is to strictly maintain the io_unit list order. We only
 * write the stripes of an io_unit to the raid disks once every io_unit up
 * to and including it has its data/parity in the log.
 */
void r5l_flush_stripe_to_raid(struct r5l_log *log)
{
	struct r5l_io_unit *io;
	struct stripe_head *sh;
	bool run_stripe;

	if (!log)
		return;
	spin_lock_irq(&log->io_list_lock);
	run_stripe = !list_empty(&log->io_end_ios);
	spin_unlock_irq(&log->io_list_lock);

	if (!run_stripe)
		return;

	blkdev_issue_flush(log->rdev->bdev, GFP_NOIO, NULL);

	spin_lock_irq(&log->io_list_lock);
	list_for_each_entry(io, &log->io_end_ios, log_sibling) {
		if (io->state >= IO_UNIT_STRIPE_START)
			continue;
		__r5l_set_io_unit_state(io, IO_UNIT_STRIPE_START);

		while (!list_empty(&io->stripe_list)) {
			sh = list_first_entry(&io->stripe_list,
					      struct stripe_head, log_list);
			list_del_init(&sh->log_list);
			set_bit(STRIPE_HANDLE, &sh->state);
			raid5_release_stripe(sh);
		}
	}
	spin_unlock_irq(&log->io_list_lock);
}

static void r5l_kick_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
{
	/* the log thread will write the io unit */
	wait_event(io->wait_state, io->state >= IO_UNIT_IO_END);
	if (io->state < IO_UNIT_STRIPE_START)
		r5l_flush_stripe_to_raid(log);
	wait_event(io->wait_state, io->state >= IO_UNIT_STRIPE_END);
}

static void r5l_write_super(struct r5l_log *log, sector_t cp);

static void r5l_do_reclaim(struct r5l_log *log)
{
	struct r5l_io_unit *io, *last;
	LIST_HEAD(list);
	sector_t free = 0;
	sector_t reclaim_target = xchg(&log->reclaim_target, 0);

	spin_lock_irq(&log->io_list_lock);
	/*
	 * Move the proper io_units to the reclaim list. We should not change
	 * the order: reclaimable/unreclaimable io_units can be mixed in the
	 * list, and we shouldn't reuse the space of an unreclaimable io_unit.
	 */
	while (1) {
		while (!list_empty(&log->stripe_end_ios)) {
			io = list_first_entry(&log->stripe_end_ios,
					      struct r5l_io_unit, log_sibling);
			list_move_tail(&io->log_sibling, &list);
			free += r5l_ring_distance(log, io->log_start,
						  io->log_end);
		}

		if (free >= reclaim_target ||
		    (list_empty(&log->running_ios) &&
		     list_empty(&log->io_end_ios) &&
		     list_empty(&log->stripe_end_ios)))
			break;

		/* The waiting below mostly happens when we shut down the raid */
		if (!list_empty(&log->io_end_ios)) {
			io = list_first_entry(&log->io_end_ios,
					      struct r5l_io_unit, log_sibling);
			spin_unlock_irq(&log->io_list_lock);
			/* nobody else can delete the io, we are safe */
			r5l_kick_io_unit(log, io);
			spin_lock_irq(&log->io_list_lock);
			continue;
		}

		if (!list_empty(&log->running_ios)) {
			io = list_first_entry(&log->running_ios,
					      struct r5l_io_unit, log_sibling);
			spin_unlock_irq(&log->io_list_lock);
			/* nobody else can delete the io, we are safe */
			r5l_kick_io_unit(log, io);
			spin_lock_irq(&log->io_list_lock);
			continue;
		}
	}
	spin_unlock_irq(&log->io_list_lock);

	if (list_empty(&list))
		return;

	/* super always points to the last valid meta */
	last = list_last_entry(&list, struct r5l_io_unit, log_sibling);
	/*
	 * write_super will flush the cache of each raid disk. We must write
	 * the super here, because the log area might be reused soon and we
	 * don't want to confuse recovery.
	 */
	r5l_write_super(log, last->log_start);

	mutex_lock(&log->io_mutex);
	log->last_checkpoint = last->log_start;
	log->last_cp_seq = last->seq;
	mutex_unlock(&log->io_mutex);
	r5l_run_no_space_stripes(log);

	while (!list_empty(&list)) {
		io = list_first_entry(&list, struct r5l_io_unit, log_sibling);
		list_del(&io->log_sibling);
		r5l_free_io_unit(log, io);
	}
}

static void r5l_reclaim_thread(struct md_thread *thread)
{
	struct mddev *mddev = thread->mddev;
	struct r5conf *conf = mddev->private;
	struct r5l_log *log = conf->log;

	if (!log)
		return;
	r5l_do_reclaim(log);
}

static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
{
	unsigned long target;
	unsigned long new = (unsigned long)space; /* overflow in theory */

	do {
		target = log->reclaim_target;
		if (new < target)
			return;
	} while (cmpxchg(&log->reclaim_target, target, new) != target);
	md_wakeup_thread(log->reclaim_thread);
}

struct r5l_recovery_ctx {
	struct page *meta_page;		/* current meta */
	sector_t meta_total_blocks;	/* total size of current meta and data */
	sector_t pos;			/* recovery position */
	u64 seq;			/* recovery position seq */
};

static int r5l_read_meta_block(struct r5l_log *log,
			       struct r5l_recovery_ctx *ctx)
{
	struct page *page = ctx->meta_page;
	struct r5l_meta_block *mb;
	u32 crc, stored_crc;

	if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, READ, false))
		return -EIO;

	mb = page_address(page);
	stored_crc = le32_to_cpu(mb->checksum);
	mb->checksum = 0;

	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
	    le64_to_cpu(mb->seq) != ctx->seq ||
	    mb->version != R5LOG_VERSION ||
	    le64_to_cpu(mb->position) != ctx->pos)
		return -EINVAL;

	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	if (stored_crc != crc)
		return -EINVAL;

	if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
		return -EINVAL;

	ctx->meta_total_blocks = BLOCK_SECTORS;

	return 0;
}

static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
					 struct r5l_recovery_ctx *ctx,
					 sector_t stripe_sect,
					 int *offset, sector_t *log_offset)
{
	struct r5conf *conf = log->rdev->mddev->private;
	struct stripe_head *sh;
	struct r5l_payload_data_parity *payload;
	int disk_index;

	sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
	while (1) {
		payload = page_address(ctx->meta_page) + *offset;

		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
			raid5_compute_sector(conf,
					     le64_to_cpu(payload->location), 0,
					     &disk_index, sh);

			sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
				     sh->dev[disk_index].page, READ, false);
			sh->dev[disk_index].log_checksum =
				le32_to_cpu(payload->checksum[0]);
			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
			ctx->meta_total_blocks += BLOCK_SECTORS;
		} else {
			disk_index = sh->pd_idx;
			sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
				     sh->dev[disk_index].page, READ, false);
			sh->dev[disk_index].log_checksum =
				le32_to_cpu(payload->checksum[0]);
			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);

			if (sh->qd_idx >= 0) {
				disk_index = sh->qd_idx;
				sync_page_io(log->rdev,
					     r5l_ring_add(log, *log_offset, BLOCK_SECTORS),
					     PAGE_SIZE, sh->dev[disk_index].page,
					     READ, false);
				sh->dev[disk_index].log_checksum =
					le32_to_cpu(payload->checksum[1]);
				set_bit(R5_Wantwrite,
					&sh->dev[disk_index].flags);
			}
			ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
		}

		*log_offset = r5l_ring_add(log, *log_offset,
					   le32_to_cpu(payload->size));
		*offset += sizeof(struct r5l_payload_data_parity) +
			sizeof(__le32) *
			(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
			break;
	}

	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
		void *addr;
		u32 checksum;

		if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
			continue;
		addr = kmap_atomic(sh->dev[disk_index].page);
		checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
		kunmap_atomic(addr);
		if (checksum != sh->dev[disk_index].log_checksum)
			goto error;
	}

	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
		struct md_rdev *rdev, *rrdev;

		if (!test_and_clear_bit(R5_Wantwrite,
					&sh->dev[disk_index].flags))
			continue;

		/* in case device is broken */
		rdev = rcu_dereference(conf->disks[disk_index].rdev);
		if (rdev)
			sync_page_io(rdev, stripe_sect, PAGE_SIZE,
				     sh->dev[disk_index].page, WRITE, false);
		rrdev = rcu_dereference(conf->disks[disk_index].replacement);
		if (rrdev)
			sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
				     sh->dev[disk_index].page, WRITE, false);
	}
	raid5_release_stripe(sh);
	return 0;

error:
	for (disk_index = 0; disk_index < sh->disks; disk_index++)
		sh->dev[disk_index].flags = 0;
	raid5_release_stripe(sh);
	return -EINVAL;
}

static int r5l_recovery_flush_one_meta(struct r5l_log *log,
				       struct r5l_recovery_ctx *ctx)
{
	struct r5conf *conf = log->rdev->mddev->private;
	struct r5l_payload_data_parity *payload;
	struct r5l_meta_block *mb;
	int offset;
	sector_t log_offset;
	sector_t stripe_sector;

	mb = page_address(ctx->meta_page);
	offset = sizeof(struct r5l_meta_block);
	log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);

	while (offset < le32_to_cpu(mb->meta_size)) {
		int dd;

		payload = (void *)mb + offset;
		stripe_sector = raid5_compute_sector(conf,
						     le64_to_cpu(payload->location), 0, &dd, NULL);
		if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
						  &offset, &log_offset))
			return -EINVAL;
	}
	return 0;
}

/* copy data/parity from log to raid disks */
static void r5l_recovery_flush_log(struct r5l_log *log,
				   struct r5l_recovery_ctx *ctx)
{
	while (1) {
		if (r5l_read_meta_block(log, ctx))
			return;
		if (r5l_recovery_flush_one_meta(log, ctx))
			return;
		ctx->seq++;
		ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
	}
}

static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
					  u64 seq)
{
	struct page *page;
	struct r5l_meta_block *mb;
	u32 crc;

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page)
		return -ENOMEM;
	mb = page_address(page);
	mb->magic = cpu_to_le32(R5LOG_MAGIC);
	mb->version = R5LOG_VERSION;
	mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
	mb->seq = cpu_to_le64(seq);
	mb->position = cpu_to_le64(pos);
	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	mb->checksum = cpu_to_le32(crc);

	if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, WRITE_FUA, false)) {
		__free_page(page);
		return -EIO;
	}
	__free_page(page);
	return 0;
}

static int r5l_recovery_log(struct r5l_log *log)
{
	struct r5l_recovery_ctx ctx;

	ctx.pos = log->last_checkpoint;
	ctx.seq = log->last_cp_seq;
	ctx.meta_page = alloc_page(GFP_KERNEL);
	if (!ctx.meta_page)
		return -ENOMEM;

	r5l_recovery_flush_log(log, &ctx);
	__free_page(ctx.meta_page);

	/*
	 * We did a recovery. Now ctx.pos points to an invalid meta block. The
	 * new log will start here, but we can't let the superblock point to
	 * the last valid meta block. The log might look like:
	 * | meta 1| meta 2| meta 3|
	 * meta 1 is valid, meta 2 is invalid, and meta 3 could be valid. If
	 * the superblock pointed to meta 1 and we wrote a new valid meta 2n
	 * there, then after another crash recovery would start from meta 1
	 * again; since meta 2n is now valid, recovery would think meta 3 is
	 * valid too, which is wrong.
	 * The solution is to create a new meta at meta 2's position with
	 * seq == meta 1's seq + 10 and let the superblock point to it. That
	 * recovery will not treat meta 3 as a valid meta, because its seq
	 * doesn't match.
	 */
	if (ctx.seq > log->last_cp_seq + 1) {
		int ret;

		ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
		if (ret)
			return ret;
		log->seq = ctx.seq + 11;
		log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
		r5l_write_super(log, ctx.pos);
	} else {
		log->log_start = ctx.pos;
		log->seq = ctx.seq;
	}
	return 0;
}
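
/*
 * Concretely (hypothetical numbers): if recovery started at seq 100 and
 * replayed two valid metas (seq 100 and 101), ctx.seq ends up at 102, which
 * is > last_cp_seq + 1, so an empty meta with seq 112 is written at
 * ctx.pos, log->seq becomes 113, and the superblock checkpoint is moved to
 * ctx.pos. Any stale meta further along still carries an old seq (102,
 * 103, ...) that no longer matches the expected sequence, so a later
 * recovery stops there.
 */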

static void r5l_write_super(struct r5l_log *log, sector_t cp)
{
	struct mddev *mddev = log->rdev->mddev;

	log->rdev->journal_tail = cp;
	set_bit(MD_CHANGE_DEVS, &mddev->flags);
}

static int r5l_load_log(struct r5l_log *log)
{
	struct md_rdev *rdev = log->rdev;
	struct page *page;
	struct r5l_meta_block *mb;
	sector_t cp = log->rdev->journal_tail;
	u32 stored_crc, expected_crc;
	bool create_super = false;
	int ret;

	/* Make sure it's valid */
	if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
		cp = 0;
	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	if (!sync_page_io(rdev, cp, PAGE_SIZE, page, READ, false)) {
		ret = -EIO;
		goto ioerr;
	}
	mb = page_address(page);

	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
	    mb->version != R5LOG_VERSION) {
		create_super = true;
		goto create;
	}
	stored_crc = le32_to_cpu(mb->checksum);
	mb->checksum = 0;
	expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	if (stored_crc != expected_crc) {
		create_super = true;
		goto create;
	}
	if (le64_to_cpu(mb->position) != cp) {
		create_super = true;
		goto create;
	}
create:
	if (create_super) {
		log->last_cp_seq = prandom_u32();
		cp = 0;
		/*
		 * Make sure the super points to the correct address. The log
		 * might get data very soon, and if the super doesn't have the
		 * correct log tail address, recovery can't find the log.
		 */
		r5l_write_super(log, cp);
	} else
		log->last_cp_seq = le64_to_cpu(mb->seq);

	log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
	log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
	if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
		log->max_free_space = RECLAIM_MAX_FREE_SPACE;
	log->last_checkpoint = cp;

	__free_page(page);

	return r5l_recovery_log(log);
ioerr:
	__free_page(page);
	return ret;
}

int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
{
	struct r5l_log *log;

	if (PAGE_SIZE != 4096)
		return -EINVAL;
	log = kzalloc(sizeof(*log), GFP_KERNEL);
	if (!log)
		return -ENOMEM;
	log->rdev = rdev;

	log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
				       sizeof(rdev->mddev->uuid));

	mutex_init(&log->io_mutex);

	spin_lock_init(&log->io_list_lock);
	INIT_LIST_HEAD(&log->running_ios);
	INIT_LIST_HEAD(&log->io_end_ios);
	INIT_LIST_HEAD(&log->stripe_end_ios);

	log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
	if (!log->io_kc)
		goto io_kc;

	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
						 log->rdev->mddev, "reclaim");
	if (!log->reclaim_thread)
		goto reclaim_thread;

	INIT_LIST_HEAD(&log->no_space_stripes);
	spin_lock_init(&log->no_space_stripes_lock);

	if (r5l_load_log(log))
		goto error;

	conf->log = log;
	return 0;
error:
	md_unregister_thread(&log->reclaim_thread);
reclaim_thread:
	kmem_cache_destroy(log->io_kc);
io_kc:
	kfree(log);
	return -EINVAL;
}
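
/*
 * A rough sketch of how the raid5 core is expected to drive this file
 * (illustrative only; the real call sites live in raid5.c, and
 * "journal_rdev" here just stands for whichever rdev carries the journal):
 *
 *	r5l_init_log(conf, journal_rdev);     // at array setup, if a
 *	                                      // journal device exists
 *	...
 *	r5l_write_stripe(conf->log, sh);      // queue a dirty stripe's
 *	                                      // data/parity into the log
 *	r5l_write_stripe_run(conf->log);      // submit the pending log IO
 *	r5l_flush_stripe_to_raid(conf->log);  // once log IO completes,
 *	                                      // release stripes to raid
 *	r5l_stripe_write_finished(sh);        // when a stripe's raid write
 *	                                      // is done
 *	...
 *	r5l_exit_log(conf->log);              // at array shutdown
 */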

void r5l_exit_log(struct r5l_log *log)
{
	/*
	 * At this point all stripes are finished, so every io_unit is at
	 * least in STRIPE_END state.
	 */
	r5l_wake_reclaim(log, -1L);
	md_unregister_thread(&log->reclaim_thread);
	r5l_do_reclaim(log);
	/*
	 * Force a super update, as r5l_do_reclaim might have updated the
	 * super. mddev->thread is already stopped.
	 */
	md_update_sb(log->rdev->mddev, 1);

	kmem_cache_destroy(log->io_kc);
	kfree(log);
}