writeback.c 19 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751
// SPDX-License-Identifier: GPL-2.0
/*
 * background writeback - scan btree for dirty data and write it to the backing
 * device
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */
  9. #include "bcache.h"
  10. #include "btree.h"
  11. #include "debug.h"
  12. #include "writeback.h"
  13. #include <linux/delay.h>
  14. #include <linux/kthread.h>
  15. #include <linux/sched/clock.h>
  16. #include <trace/events/bcache.h>
/* Rate limiting */
  18. static uint64_t __calc_target_rate(struct cached_dev *dc)
  19. {
  20. struct cache_set *c = dc->disk.c;
  21. /*
  22. * This is the size of the cache, minus the amount used for
  23. * flash-only devices
  24. */
  25. uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
  26. bcache_flash_devs_sectors_dirty(c);
  27. /*
  28. * Unfortunately there is no control of global dirty data. If the
  29. * user states that they want 10% dirty data in the cache, and has,
  30. * e.g., 5 backing volumes of equal size, we try and ensure each
  31. * backing volume uses about 2% of the cache for dirty data.
  32. */
  33. uint32_t bdev_share =
  34. div64_u64(bdev_sectors(dc->bdev) << WRITEBACK_SHARE_SHIFT,
  35. c->cached_dev_sectors);
  36. uint64_t cache_dirty_target =
  37. div_u64(cache_sectors * dc->writeback_percent, 100);
  38. /* Ensure each backing dev gets at least one dirty share */
  39. if (bdev_share < 1)
  40. bdev_share = 1;
  41. return (cache_dirty_target * bdev_share) >> WRITEBACK_SHARE_SHIFT;
  42. }
  43. static void __update_writeback_rate(struct cached_dev *dc)
  44. {
  45. /*
  46. * PI controller:
  47. * Figures out the amount that should be written per second.
  48. *
  49. * First, the error (number of sectors that are dirty beyond our
  50. * target) is calculated. The error is accumulated (numerically
  51. * integrated).
  52. *
  53. * Then, the proportional value and integral value are scaled
  54. * based on configured values. These are stored as inverses to
  55. * avoid fixed point math and to make configuration easy-- e.g.
  56. * the default value of 40 for writeback_rate_p_term_inverse
  57. * attempts to write at a rate that would retire all the dirty
  58. * blocks in 40 seconds.
  59. *
  60. * The writeback_rate_i_inverse value of 10000 means that 1/10000th
  61. * of the error is accumulated in the integral term per second.
  62. * This acts as a slow, long-term average that is not subject to
  63. * variations in usage like the p term.
  64. */
  65. int64_t target = __calc_target_rate(dc);
  66. int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
  67. int64_t error = dirty - target;
  68. int64_t proportional_scaled =
  69. div_s64(error, dc->writeback_rate_p_term_inverse);
  70. int64_t integral_scaled;
  71. uint32_t new_rate;
  72. if ((error < 0 && dc->writeback_rate_integral > 0) ||
  73. (error > 0 && time_before64(local_clock(),
  74. dc->writeback_rate.next + NSEC_PER_MSEC))) {
  75. /*
  76. * Only decrease the integral term if it's more than
  77. * zero. Only increase the integral term if the device
  78. * is keeping up. (Don't wind up the integral
  79. * ineffectively in either case).
  80. *
  81. * It's necessary to scale this by
  82. * writeback_rate_update_seconds to keep the integral
  83. * term dimensioned properly.
  84. */
  85. dc->writeback_rate_integral += error *
  86. dc->writeback_rate_update_seconds;
  87. }
  88. integral_scaled = div_s64(dc->writeback_rate_integral,
  89. dc->writeback_rate_i_term_inverse);
  90. new_rate = clamp_t(int32_t, (proportional_scaled + integral_scaled),
  91. dc->writeback_rate_minimum, NSEC_PER_SEC);
  92. dc->writeback_rate_proportional = proportional_scaled;
  93. dc->writeback_rate_integral_scaled = integral_scaled;
  94. dc->writeback_rate_change = new_rate - dc->writeback_rate.rate;
  95. dc->writeback_rate.rate = new_rate;
  96. dc->writeback_rate_target = target;
  97. }
  98. static void update_writeback_rate(struct work_struct *work)
  99. {
  100. struct cached_dev *dc = container_of(to_delayed_work(work),
  101. struct cached_dev,
  102. writeback_rate_update);
  103. struct cache_set *c = dc->disk.c;
  104. /*
  105. * should check BCACHE_DEV_RATE_DW_RUNNING before calling
  106. * cancel_delayed_work_sync().
  107. */
  108. set_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
  109. /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
  110. smp_mb();
  111. /*
  112. * CACHE_SET_IO_DISABLE might be set via sysfs interface,
  113. * check it here too.
  114. */
  115. if (!test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) ||
  116. test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
  117. clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
  118. /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
  119. smp_mb();
  120. return;
  121. }
  122. down_read(&dc->writeback_lock);
  123. if (atomic_read(&dc->has_dirty) &&
  124. dc->writeback_percent)
  125. __update_writeback_rate(dc);
  126. up_read(&dc->writeback_lock);
  127. /*
  128. * CACHE_SET_IO_DISABLE might be set via sysfs interface,
  129. * check it here too.
  130. */
  131. if (test_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags) &&
  132. !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
  133. schedule_delayed_work(&dc->writeback_rate_update,
  134. dc->writeback_rate_update_seconds * HZ);
  135. }
  136. /*
  137. * should check BCACHE_DEV_RATE_DW_RUNNING before calling
  138. * cancel_delayed_work_sync().
  139. */
  140. clear_bit(BCACHE_DEV_RATE_DW_RUNNING, &dc->disk.flags);
  141. /* paired with where BCACHE_DEV_RATE_DW_RUNNING is tested */
  142. smp_mb();
  143. }
  144. static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
  145. {
  146. if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
  147. !dc->writeback_percent)
  148. return 0;
  149. return bch_next_delay(&dc->writeback_rate, sectors);
  150. }
  151. struct dirty_io {
  152. struct closure cl;
  153. struct cached_dev *dc;
  154. uint16_t sequence;
  155. struct bio bio;
  156. };
  157. static void dirty_init(struct keybuf_key *w)
  158. {
  159. struct dirty_io *io = w->private;
  160. struct bio *bio = &io->bio;
  161. bio_init(bio, bio->bi_inline_vecs,
  162. DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS));
  163. if (!io->dc->writeback_percent)
  164. bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
  165. bio->bi_iter.bi_size = KEY_SIZE(&w->key) << 9;
  166. bio->bi_private = w;
  167. bch_bio_map(bio, NULL);
  168. }
  169. static void dirty_io_destructor(struct closure *cl)
  170. {
  171. struct dirty_io *io = container_of(cl, struct dirty_io, cl);
  172. kfree(io);
  173. }
  174. static void write_dirty_finish(struct closure *cl)
  175. {
  176. struct dirty_io *io = container_of(cl, struct dirty_io, cl);
  177. struct keybuf_key *w = io->bio.bi_private;
  178. struct cached_dev *dc = io->dc;
  179. bio_free_pages(&io->bio);
  180. /* This is kind of a dumb way of signalling errors. */
  181. if (KEY_DIRTY(&w->key)) {
  182. int ret;
  183. unsigned i;
  184. struct keylist keys;
  185. bch_keylist_init(&keys);
  186. bkey_copy(keys.top, &w->key);
  187. SET_KEY_DIRTY(keys.top, false);
  188. bch_keylist_push(&keys);
  189. for (i = 0; i < KEY_PTRS(&w->key); i++)
  190. atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
  191. ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key);
  192. if (ret)
  193. trace_bcache_writeback_collision(&w->key);
  194. atomic_long_inc(ret
  195. ? &dc->disk.c->writeback_keys_failed
  196. : &dc->disk.c->writeback_keys_done);
  197. }
  198. bch_keybuf_del(&dc->writeback_keys, w);
  199. up(&dc->in_flight);
  200. closure_return_with_destructor(cl, dirty_io_destructor);
  201. }
  202. static void dirty_endio(struct bio *bio)
  203. {
  204. struct keybuf_key *w = bio->bi_private;
  205. struct dirty_io *io = w->private;
  206. if (bio->bi_status) {
  207. SET_KEY_DIRTY(&w->key, false);
  208. bch_count_backing_io_errors(io->dc, bio);
  209. }
  210. closure_put(&io->cl);
  211. }
  212. static void write_dirty(struct closure *cl)
  213. {
  214. struct dirty_io *io = container_of(cl, struct dirty_io, cl);
  215. struct keybuf_key *w = io->bio.bi_private;
  216. struct cached_dev *dc = io->dc;
  217. uint16_t next_sequence;
  218. if (atomic_read(&dc->writeback_sequence_next) != io->sequence) {
  219. /* Not our turn to write; wait for a write to complete */
  220. closure_wait(&dc->writeback_ordering_wait, cl);
  221. if (atomic_read(&dc->writeback_sequence_next) == io->sequence) {
  222. /*
  223. * Edge case-- it happened in indeterminate order
  224. * relative to when we were added to wait list..
  225. */
  226. closure_wake_up(&dc->writeback_ordering_wait);
  227. }
  228. continue_at(cl, write_dirty, io->dc->writeback_write_wq);
  229. return;
  230. }
  231. next_sequence = io->sequence + 1;
  232. /*
  233. * IO errors are signalled using the dirty bit on the key.
  234. * If we failed to read, we should not attempt to write to the
  235. * backing device. Instead, immediately go to write_dirty_finish
  236. * to clean up.
  237. */
  238. if (KEY_DIRTY(&w->key)) {
  239. dirty_init(w);
  240. bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
  241. io->bio.bi_iter.bi_sector = KEY_START(&w->key);
  242. bio_set_dev(&io->bio, io->dc->bdev);
  243. io->bio.bi_end_io = dirty_endio;
  244. /* I/O request sent to backing device */
  245. closure_bio_submit(io->dc->disk.c, &io->bio, cl);
  246. }
  247. atomic_set(&dc->writeback_sequence_next, next_sequence);
  248. closure_wake_up(&dc->writeback_ordering_wait);
  249. continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
  250. }
  251. static void read_dirty_endio(struct bio *bio)
  252. {
  253. struct keybuf_key *w = bio->bi_private;
  254. struct dirty_io *io = w->private;
  255. /* is_read = 1 */
  256. bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
  257. bio->bi_status, 1,
  258. "reading dirty data from cache");
  259. dirty_endio(bio);
  260. }
  261. static void read_dirty_submit(struct closure *cl)
  262. {
  263. struct dirty_io *io = container_of(cl, struct dirty_io, cl);
  264. closure_bio_submit(io->dc->disk.c, &io->bio, cl);
  265. continue_at(cl, write_dirty, io->dc->writeback_write_wq);
  266. }
  267. static void read_dirty(struct cached_dev *dc)
  268. {
  269. unsigned delay = 0;
  270. struct keybuf_key *next, *keys[MAX_WRITEBACKS_IN_PASS], *w;
  271. size_t size;
  272. int nk, i;
  273. struct dirty_io *io;
  274. struct closure cl;
  275. uint16_t sequence = 0;
  276. BUG_ON(!llist_empty(&dc->writeback_ordering_wait.list));
  277. atomic_set(&dc->writeback_sequence_next, sequence);
  278. closure_init_stack(&cl);
  279. /*
  280. * XXX: if we error, background writeback just spins. Should use some
  281. * mempools.
  282. */
  283. next = bch_keybuf_next(&dc->writeback_keys);
  284. while (!kthread_should_stop() &&
  285. !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) &&
  286. next) {
  287. size = 0;
  288. nk = 0;
  289. do {
  290. BUG_ON(ptr_stale(dc->disk.c, &next->key, 0));
  291. /*
  292. * Don't combine too many operations, even if they
  293. * are all small.
  294. */
  295. if (nk >= MAX_WRITEBACKS_IN_PASS)
  296. break;
  297. /*
  298. * If the current operation is very large, don't
  299. * further combine operations.
  300. */
  301. if (size >= MAX_WRITESIZE_IN_PASS)
  302. break;
  303. /*
  304. * Operations are only eligible to be combined
  305. * if they are contiguous.
  306. *
  307. * TODO: add a heuristic willing to fire a
  308. * certain amount of non-contiguous IO per pass,
  309. * so that we can benefit from backing device
  310. * command queueing.
  311. */
  312. if ((nk != 0) && bkey_cmp(&keys[nk-1]->key,
  313. &START_KEY(&next->key)))
  314. break;
  315. size += KEY_SIZE(&next->key);
  316. keys[nk++] = next;
  317. } while ((next = bch_keybuf_next(&dc->writeback_keys)));
  318. /* Now we have gathered a set of 1..5 keys to write back. */
  319. for (i = 0; i < nk; i++) {
  320. w = keys[i];
  321. io = kzalloc(sizeof(struct dirty_io) +
  322. sizeof(struct bio_vec) *
  323. DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
  324. GFP_KERNEL);
  325. if (!io)
  326. goto err;
  327. w->private = io;
  328. io->dc = dc;
  329. io->sequence = sequence++;
  330. dirty_init(w);
  331. bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
  332. io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
  333. bio_set_dev(&io->bio,
  334. PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
  335. io->bio.bi_end_io = read_dirty_endio;
  336. if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL))
  337. goto err_free;
  338. trace_bcache_writeback(&w->key);
  339. down(&dc->in_flight);
  340. /* We've acquired a semaphore for the maximum
  341. * simultaneous number of writebacks; from here
  342. * everything happens asynchronously.
  343. */
  344. closure_call(&io->cl, read_dirty_submit, NULL, &cl);
  345. }
  346. delay = writeback_delay(dc, size);
  347. /* If the control system would wait for at least half a
  348. * second, and there's been no reqs hitting the backing disk
  349. * for awhile: use an alternate mode where we have at most
  350. * one contiguous set of writebacks in flight at a time. If
  351. * someone wants to do IO it will be quick, as it will only
  352. * have to contend with one operation in flight, and we'll
  353. * be round-tripping data to the backing disk as quickly as
  354. * it can accept it.
  355. */
  356. if (delay >= HZ / 2) {
  357. /* 3 means at least 1.5 seconds, up to 7.5 if we
  358. * have slowed way down.
  359. */
  360. if (atomic_inc_return(&dc->backing_idle) >= 3) {
  361. /* Wait for current I/Os to finish */
  362. closure_sync(&cl);
  363. /* And immediately launch a new set. */
  364. delay = 0;
  365. }
  366. }
  367. while (!kthread_should_stop() &&
  368. !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) &&
  369. delay) {
  370. schedule_timeout_interruptible(delay);
  371. delay = writeback_delay(dc, 0);
  372. }
  373. }
  374. if (0) {
  375. err_free:
  376. kfree(w->private);
  377. err:
  378. bch_keybuf_del(&dc->writeback_keys, w);
  379. }
  380. /*
  381. * Wait for outstanding writeback IOs to finish (and keybuf slots to be
  382. * freed) before refilling again
  383. */
  384. closure_sync(&cl);
  385. }
/* Scan for dirty data */
  387. void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
  388. uint64_t offset, int nr_sectors)
  389. {
  390. struct bcache_device *d = c->devices[inode];
  391. unsigned stripe_offset, stripe, sectors_dirty;
  392. if (!d)
  393. return;
  394. stripe = offset_to_stripe(d, offset);
  395. stripe_offset = offset & (d->stripe_size - 1);
  396. while (nr_sectors) {
  397. int s = min_t(unsigned, abs(nr_sectors),
  398. d->stripe_size - stripe_offset);
  399. if (nr_sectors < 0)
  400. s = -s;
  401. if (stripe >= d->nr_stripes)
  402. return;
  403. sectors_dirty = atomic_add_return(s,
  404. d->stripe_sectors_dirty + stripe);
  405. if (sectors_dirty == d->stripe_size)
  406. set_bit(stripe, d->full_dirty_stripes);
  407. else
  408. clear_bit(stripe, d->full_dirty_stripes);
  409. nr_sectors -= s;
  410. stripe_offset = 0;
  411. stripe++;
  412. }
  413. }
  414. static bool dirty_pred(struct keybuf *buf, struct bkey *k)
  415. {
  416. struct cached_dev *dc = container_of(buf, struct cached_dev, writeback_keys);
  417. BUG_ON(KEY_INODE(k) != dc->disk.id);
  418. return KEY_DIRTY(k);
  419. }
  420. static void refill_full_stripes(struct cached_dev *dc)
  421. {
  422. struct keybuf *buf = &dc->writeback_keys;
  423. unsigned start_stripe, stripe, next_stripe;
  424. bool wrapped = false;
  425. stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned));
  426. if (stripe >= dc->disk.nr_stripes)
  427. stripe = 0;
  428. start_stripe = stripe;
  429. while (1) {
  430. stripe = find_next_bit(dc->disk.full_dirty_stripes,
  431. dc->disk.nr_stripes, stripe);
  432. if (stripe == dc->disk.nr_stripes)
  433. goto next;
  434. next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes,
  435. dc->disk.nr_stripes, stripe);
  436. buf->last_scanned = KEY(dc->disk.id,
  437. stripe * dc->disk.stripe_size, 0);
  438. bch_refill_keybuf(dc->disk.c, buf,
  439. &KEY(dc->disk.id,
  440. next_stripe * dc->disk.stripe_size, 0),
  441. dirty_pred);
  442. if (array_freelist_empty(&buf->freelist))
  443. return;
  444. stripe = next_stripe;
  445. next:
  446. if (wrapped && stripe > start_stripe)
  447. return;
  448. if (stripe == dc->disk.nr_stripes) {
  449. stripe = 0;
  450. wrapped = true;
  451. }
  452. }
  453. }
  454. /*
  455. * Returns true if we scanned the entire disk
  456. */
  457. static bool refill_dirty(struct cached_dev *dc)
  458. {
  459. struct keybuf *buf = &dc->writeback_keys;
  460. struct bkey start = KEY(dc->disk.id, 0, 0);
  461. struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0);
  462. struct bkey start_pos;
  463. /*
  464. * make sure keybuf pos is inside the range for this disk - at bringup
  465. * we might not be attached yet so this disk's inode nr isn't
  466. * initialized then
  467. */
  468. if (bkey_cmp(&buf->last_scanned, &start) < 0 ||
  469. bkey_cmp(&buf->last_scanned, &end) > 0)
  470. buf->last_scanned = start;
  471. if (dc->partial_stripes_expensive) {
  472. refill_full_stripes(dc);
  473. if (array_freelist_empty(&buf->freelist))
  474. return false;
  475. }
  476. start_pos = buf->last_scanned;
  477. bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
  478. if (bkey_cmp(&buf->last_scanned, &end) < 0)
  479. return false;
  480. /*
  481. * If we get to the end start scanning again from the beginning, and
  482. * only scan up to where we initially started scanning from:
  483. */
  484. buf->last_scanned = start;
  485. bch_refill_keybuf(dc->disk.c, buf, &start_pos, dirty_pred);
  486. return bkey_cmp(&buf->last_scanned, &start_pos) >= 0;
  487. }
  488. static int bch_writeback_thread(void *arg)
  489. {
  490. struct cached_dev *dc = arg;
  491. struct cache_set *c = dc->disk.c;
  492. bool searched_full_index;
  493. bch_ratelimit_reset(&dc->writeback_rate);
  494. while (!kthread_should_stop() &&
  495. !test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
  496. down_write(&dc->writeback_lock);
  497. set_current_state(TASK_INTERRUPTIBLE);
  498. /*
  499. * If the bache device is detaching, skip here and continue
  500. * to perform writeback. Otherwise, if no dirty data on cache,
  501. * or there is dirty data on cache but writeback is disabled,
  502. * the writeback thread should sleep here and wait for others
  503. * to wake up it.
  504. */
  505. if (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
  506. (!atomic_read(&dc->has_dirty) || !dc->writeback_running)) {
  507. up_write(&dc->writeback_lock);
  508. if (kthread_should_stop() ||
  509. test_bit(CACHE_SET_IO_DISABLE, &c->flags)) {
  510. set_current_state(TASK_RUNNING);
  511. break;
  512. }
  513. schedule();
  514. continue;
  515. }
  516. set_current_state(TASK_RUNNING);
  517. searched_full_index = refill_dirty(dc);
  518. if (searched_full_index &&
  519. RB_EMPTY_ROOT(&dc->writeback_keys.keys)) {
  520. atomic_set(&dc->has_dirty, 0);
  521. SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
  522. bch_write_bdev_super(dc, NULL);
  523. /*
  524. * If bcache device is detaching via sysfs interface,
  525. * writeback thread should stop after there is no dirty
  526. * data on cache. BCACHE_DEV_DETACHING flag is set in
  527. * bch_cached_dev_detach().
  528. */
  529. if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
  530. break;
  531. }
  532. up_write(&dc->writeback_lock);
  533. read_dirty(dc);
  534. if (searched_full_index) {
  535. unsigned delay = dc->writeback_delay * HZ;
  536. while (delay &&
  537. !kthread_should_stop() &&
  538. !test_bit(CACHE_SET_IO_DISABLE, &c->flags) &&
  539. !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
  540. delay = schedule_timeout_interruptible(delay);
  541. bch_ratelimit_reset(&dc->writeback_rate);
  542. }
  543. }
  544. cached_dev_put(dc);
  545. wait_for_kthread_stop();
  546. return 0;
  547. }
/* Init */
  549. struct sectors_dirty_init {
  550. struct btree_op op;
  551. unsigned inode;
  552. };
  553. static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
  554. struct bkey *k)
  555. {
  556. struct sectors_dirty_init *op = container_of(_op,
  557. struct sectors_dirty_init, op);
  558. if (KEY_INODE(k) > op->inode)
  559. return MAP_DONE;
  560. if (KEY_DIRTY(k))
  561. bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
  562. KEY_START(k), KEY_SIZE(k));
  563. return MAP_CONTINUE;
  564. }
  565. void bch_sectors_dirty_init(struct bcache_device *d)
  566. {
  567. struct sectors_dirty_init op;
  568. bch_btree_op_init(&op.op, -1);
  569. op.inode = d->id;
  570. bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0),
  571. sectors_dirty_init_fn, 0);
  572. }
  573. void bch_cached_dev_writeback_init(struct cached_dev *dc)
  574. {
  575. sema_init(&dc->in_flight, 64);
  576. init_rwsem(&dc->writeback_lock);
  577. bch_keybuf_init(&dc->writeback_keys);
  578. dc->writeback_metadata = true;
  579. dc->writeback_running = true;
  580. dc->writeback_percent = 10;
  581. dc->writeback_delay = 30;
  582. dc->writeback_rate.rate = 1024;
  583. dc->writeback_rate_minimum = 8;
  584. dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT;
  585. dc->writeback_rate_p_term_inverse = 40;
  586. dc->writeback_rate_i_term_inverse = 10000;
  587. WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
  588. INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
  589. }
  590. int bch_cached_dev_writeback_start(struct cached_dev *dc)
  591. {
  592. dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq",
  593. WQ_MEM_RECLAIM, 0);
  594. if (!dc->writeback_write_wq)
  595. return -ENOMEM;
  596. cached_dev_get(dc);
  597. dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
  598. "bcache_writeback");
  599. if (IS_ERR(dc->writeback_thread)) {
  600. cached_dev_put(dc);
  601. return PTR_ERR(dc->writeback_thread);
  602. }
  603. WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
  604. schedule_delayed_work(&dc->writeback_rate_update,
  605. dc->writeback_rate_update_seconds * HZ);
  606. bch_writeback_queue(dc);
  607. return 0;
  608. }