compaction.c

/*
 * linux/mm/compaction.c
 *
 * Memory compaction for the reduction of external fragmentation. Note that
 * this heavily depends upon page migration to do all the real heavy
 * lifting
 *
 * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
 */
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/compaction.h>
#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/sysctl.h>
#include <linux/sysfs.h>
#include <linux/balloon_compaction.h>
#include <linux/page-isolation.h>
#include "internal.h"

#ifdef CONFIG_COMPACTION
static inline void count_compact_event(enum vm_event_item item)
{
    count_vm_event(item);
}

static inline void count_compact_events(enum vm_event_item item, long delta)
{
    count_vm_events(item, delta);
}
#else
#define count_compact_event(item) do { } while (0)
#define count_compact_events(item, delta) do { } while (0)
#endif

#if defined CONFIG_COMPACTION || defined CONFIG_CMA

#define CREATE_TRACE_POINTS
#include <trace/events/compaction.h>
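
/* Free every page on a private freepage list and return how many were freed */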
static unsigned long release_freepages(struct list_head *freelist)
{
    struct page *page, *next;
    unsigned long count = 0;

    list_for_each_entry_safe(page, next, freelist, lru) {
        list_del(&page->lru);
        __free_page(page);
        count++;
    }

    return count;
}
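
/* split_free_page() does not map the pages, so map each isolated page here */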
static void map_pages(struct list_head *list)
{
    struct page *page;

    list_for_each_entry(page, list, lru) {
        arch_alloc_page(page, 0);
        kernel_map_pages(page, 1, 1);
    }
}
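
/* Async compaction only operates on MIGRATE_MOVABLE and MIGRATE_CMA pageblocks */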
static inline bool migrate_async_suitable(int migratetype)
{
    return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
}

#ifdef CONFIG_COMPACTION
/* Returns true if the pageblock should be scanned for pages to isolate. */
static inline bool isolation_suitable(struct compact_control *cc,
                    struct page *page)
{
    if (cc->ignore_skip_hint)
        return true;

    return !get_pageblock_skip(page);
}

/*
 * This function is called to clear all cached information on pageblocks that
 * should be skipped for page isolation when the migrate and free page scanner
 * meet.
 */
static void __reset_isolation_suitable(struct zone *zone)
{
    unsigned long start_pfn = zone->zone_start_pfn;
    unsigned long end_pfn = zone_end_pfn(zone);
    unsigned long pfn;

    zone->compact_cached_migrate_pfn[0] = start_pfn;
    zone->compact_cached_migrate_pfn[1] = start_pfn;
    zone->compact_cached_free_pfn = end_pfn;
    zone->compact_blockskip_flush = false;

    /* Walk the zone and mark every pageblock as suitable for isolation */
    for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
        struct page *page;

        cond_resched();

        if (!pfn_valid(pfn))
            continue;

        page = pfn_to_page(pfn);
        if (zone != page_zone(page))
            continue;

        clear_pageblock_skip(page);
    }
}

void reset_isolation_suitable(pg_data_t *pgdat)
{
    int zoneid;

    for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
        struct zone *zone = &pgdat->node_zones[zoneid];

        if (!populated_zone(zone))
            continue;

        /* Only flush if a full compaction finished recently */
        if (zone->compact_blockskip_flush)
            __reset_isolation_suitable(zone);
    }
}

/*
 * If no pages were isolated then mark this pageblock to be skipped in the
 * future. The information is later cleared by __reset_isolation_suitable().
 */
static void update_pageblock_skip(struct compact_control *cc,
            struct page *page, unsigned long nr_isolated,
            bool set_unsuitable, bool migrate_scanner)
{
    struct zone *zone = cc->zone;
    unsigned long pfn;

    if (cc->ignore_skip_hint)
        return;

    if (!page)
        return;

    if (nr_isolated)
        return;

    /*
     * Only skip pageblocks when all forms of compaction will be known to
     * fail in the near future.
     */
    if (set_unsuitable)
        set_pageblock_skip(page);

    pfn = page_to_pfn(page);

    /* Update where async and sync compaction should restart */
    if (migrate_scanner) {
        if (cc->finished_update_migrate)
            return;
        if (pfn > zone->compact_cached_migrate_pfn[0])
            zone->compact_cached_migrate_pfn[0] = pfn;
        if (cc->mode != MIGRATE_ASYNC &&
            pfn > zone->compact_cached_migrate_pfn[1])
            zone->compact_cached_migrate_pfn[1] = pfn;
    } else {
        if (cc->finished_update_free)
            return;
        if (pfn < zone->compact_cached_free_pfn)
            zone->compact_cached_free_pfn = pfn;
    }
}
#else
static inline bool isolation_suitable(struct compact_control *cc,
                    struct page *page)
{
    return true;
}

static void update_pageblock_skip(struct compact_control *cc,
            struct page *page, unsigned long nr_isolated,
            bool set_unsuitable, bool migrate_scanner)
{
}
#endif /* CONFIG_COMPACTION */
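
/* True if the lock should be dropped: a reschedule is due or the lock is contended */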
static inline bool should_release_lock(spinlock_t *lock)
{
    return need_resched() || spin_is_contended(lock);
}

/*
 * Compaction requires the taking of some coarse locks that are potentially
 * very heavily contended. Check if the process needs to be scheduled or
 * if the lock is contended. For async compaction, back out in the event
 * of severe contention. For sync compaction, schedule.
 *
 * Returns true if the lock is held.
 * Returns false if the lock is released and compaction should abort.
 */
static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
                    bool locked, struct compact_control *cc)
{
    if (should_release_lock(lock)) {
        if (locked) {
            spin_unlock_irqrestore(lock, *flags);
            locked = false;
        }

        /* async aborts if taking too long or contended */
        if (cc->mode == MIGRATE_ASYNC) {
            cc->contended = true;
            return false;
        }

        cond_resched();
    }

    if (!locked)
        spin_lock_irqsave(lock, *flags);
    return true;
}

/* Returns true if the page is within a block suitable for migration to */
static bool suitable_migration_target(struct page *page)
{
    /* If the page is a large free page, then disallow migration */
    if (PageBuddy(page) && page_order(page) >= pageblock_order)
        return false;

    /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
    if (migrate_async_suitable(get_pageblock_migratetype(page)))
        return true;

    /* Otherwise skip the block */
    return false;
}

/*
 * Isolate free pages onto a private freelist. If @strict is true, will abort
 * returning 0 on any invalid PFNs or non-free pages inside of the pageblock
 * (even though it may still end up isolating some pages).
 */
static unsigned long isolate_freepages_block(struct compact_control *cc,
                unsigned long blockpfn,
                unsigned long end_pfn,
                struct list_head *freelist,
                bool strict)
{
    int nr_scanned = 0, total_isolated = 0;
    struct page *cursor, *valid_page = NULL;
    unsigned long flags;
    bool locked = false;
    bool checked_pageblock = false;

    cursor = pfn_to_page(blockpfn);

    /* Isolate free pages. */
    for (; blockpfn < end_pfn; blockpfn++, cursor++) {
        int isolated, i;
        struct page *page = cursor;

        nr_scanned++;
        if (!pfn_valid_within(blockpfn))
            goto isolate_fail;

        if (!valid_page)
            valid_page = page;
        if (!PageBuddy(page))
            goto isolate_fail;

        /*
         * The zone lock must be held to isolate freepages.
         * Unfortunately this is a very coarse lock and can be
         * heavily contended if there are parallel allocations
         * or parallel compactions. For async compaction, do not
         * spin on the lock; acquire it as late as possible.
         */
        locked = compact_checklock_irqsave(&cc->zone->lock, &flags,
                                locked, cc);
        if (!locked)
            break;

        /* Recheck this is a suitable migration target under lock */
        if (!strict && !checked_pageblock) {
            /*
             * We need to check suitability of the pageblock only
             * once, and isolate_freepages_block() is called with a
             * range inside a single pageblock, so checking once is
             * sufficient.
             */
            checked_pageblock = true;
            if (!suitable_migration_target(page))
                break;
        }

        /* Recheck this is a buddy page under lock */
        if (!PageBuddy(page))
            goto isolate_fail;

        /* Found a free page, break it into order-0 pages */
        isolated = split_free_page(page);
        total_isolated += isolated;
        for (i = 0; i < isolated; i++) {
            list_add(&page->lru, freelist);
            page++;
        }

        /* If a page was split, advance to the end of it */
        if (isolated) {
            blockpfn += isolated - 1;
            cursor += isolated - 1;
            continue;
        }

isolate_fail:
        if (strict)
            break;
        else
            continue;
    }

    trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);

    /*
     * If strict isolation is requested by CMA then check that all the
     * pages requested were isolated. If there were any failures, 0 is
     * returned and CMA will fail.
     */
    if (strict && blockpfn < end_pfn)
        total_isolated = 0;

    if (locked)
        spin_unlock_irqrestore(&cc->zone->lock, flags);

    /* Update the pageblock-skip if the whole pageblock was scanned */
    if (blockpfn == end_pfn)
        update_pageblock_skip(cc, valid_page, total_isolated, true,
                      false);

    count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
    if (total_isolated)
        count_compact_events(COMPACTISOLATED, total_isolated);
    return total_isolated;
}

/**
 * isolate_freepages_range() - isolate free pages.
 * @start_pfn: The first PFN to start isolating.
 * @end_pfn:   The one-past-last PFN.
 *
 * Non-free pages, invalid PFNs, or zone boundaries within the
 * [start_pfn, end_pfn) range are considered errors, cause the function to
 * undo its actions and return zero.
 *
 * Otherwise, the function returns the one-past-the-last PFN of the isolated
 * pages (which may be greater than end_pfn if the end fell in the middle of
 * a free page).
 */
unsigned long
isolate_freepages_range(struct compact_control *cc,
            unsigned long start_pfn, unsigned long end_pfn)
{
    unsigned long isolated, pfn, block_end_pfn;
    LIST_HEAD(freelist);

    for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) {
        if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn)))
            break;

        /*
         * On subsequent iterations ALIGN() is actually not needed,
         * but we keep it so as not to complicate the code.
         */
        block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
        block_end_pfn = min(block_end_pfn, end_pfn);

        isolated = isolate_freepages_block(cc, pfn, block_end_pfn,
                           &freelist, true);

        /*
         * In strict mode, isolate_freepages_block() returns 0 if
         * there are any holes in the block (ie. invalid PFNs or
         * non-free pages).
         */
        if (!isolated)
            break;

        /*
         * If we managed to isolate pages, it is always (1 << n) *
         * pageblock_nr_pages for some non-negative n. (Max order
         * page may span two pageblocks.)
         */
    }

    /* split_free_page does not map the pages */
    map_pages(&freelist);

    if (pfn < end_pfn) {
        /* Loop terminated early, cleanup. */
        release_freepages(&freelist);
        return 0;
    }

    /* We don't use freelists for anything. */
    return pfn;
}

/* Update the number of anon and file isolated pages in the zone */
static void acct_isolated(struct zone *zone, bool locked, struct compact_control *cc)
{
    struct page *page;
    unsigned int count[2] = { 0, };

    list_for_each_entry(page, &cc->migratepages, lru)
        count[!!page_is_file_cache(page)]++;

    /* If locked we can use the interrupt unsafe versions */
    if (locked) {
        __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
        __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
    } else {
        mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
        mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
    }
}

/* Similar to reclaim, but different enough that they don't share logic */
static bool too_many_isolated(struct zone *zone)
{
    unsigned long active, inactive, isolated;

    inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
            zone_page_state(zone, NR_INACTIVE_ANON);
    active = zone_page_state(zone, NR_ACTIVE_FILE) +
            zone_page_state(zone, NR_ACTIVE_ANON);
    isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
            zone_page_state(zone, NR_ISOLATED_ANON);

    return isolated > (inactive + active) / 2;
}

/**
 * isolate_migratepages_range() - isolate all migrate-able pages in range.
 * @zone:        Zone pages are in.
 * @cc:          Compaction control structure.
 * @low_pfn:     The first PFN of the range.
 * @end_pfn:     The one-past-the-last PFN of the range.
 * @unevictable: true if unevictable pages may be isolated.
 *
 * Isolate all pages that can be migrated from the range specified by
 * [low_pfn, end_pfn). Returns zero if there is a fatal signal
 * pending; otherwise, the PFN of the first page that was not scanned
 * (which may be less than, equal to, or greater than end_pfn).
 *
 * Assumes that cc->migratepages is empty and cc->nr_migratepages is
 * zero.
 *
 * Apart from cc->migratepages and cc->nr_migratepages this function
 * does not modify any cc's fields, in particular it does not modify
 * (or read for that matter) cc->migrate_pfn.
 */
unsigned long
isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
        unsigned long low_pfn, unsigned long end_pfn, bool unevictable)
{
    unsigned long last_pageblock_nr = 0, pageblock_nr;
    unsigned long nr_scanned = 0, nr_isolated = 0;
    struct list_head *migratelist = &cc->migratepages;
    struct lruvec *lruvec;
    unsigned long flags;
    bool locked = false;
    struct page *page = NULL, *valid_page = NULL;
    bool set_unsuitable = true;
    const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ?
                    ISOLATE_ASYNC_MIGRATE : 0) |
                    (unevictable ? ISOLATE_UNEVICTABLE : 0);

    /*
     * Ensure that there are not too many pages isolated from the LRU
     * list by either parallel reclaimers or compaction. If there are,
     * delay for some time until fewer pages are isolated
     */
    while (unlikely(too_many_isolated(zone))) {
        /* async migration should just abort */
        if (cc->mode == MIGRATE_ASYNC)
            return 0;

        congestion_wait(BLK_RW_ASYNC, HZ/10);

        if (fatal_signal_pending(current))
            return 0;
    }

    if (cond_resched()) {
        /* Async terminates prematurely on need_resched() */
        if (cc->mode == MIGRATE_ASYNC)
            return 0;
    }

    /* Time to isolate some pages for migration */
    for (; low_pfn < end_pfn; low_pfn++) {
        /* give a chance to irqs before checking need_resched() */
        if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {
            if (should_release_lock(&zone->lru_lock)) {
                spin_unlock_irqrestore(&zone->lru_lock, flags);
                locked = false;
            }
        }

        /*
         * migrate_pfn does not necessarily start aligned to a
         * pageblock. Ensure that pfn_valid is called when moving
         * into a new MAX_ORDER_NR_PAGES range in case of large
         * memory holes within the zone
         */
        if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
            if (!pfn_valid(low_pfn)) {
                low_pfn += MAX_ORDER_NR_PAGES - 1;
                continue;
            }
        }

        if (!pfn_valid_within(low_pfn))
            continue;
        nr_scanned++;

        /*
         * Get the page and ensure the page is within the same zone.
         * See the comment in isolate_freepages about overlapping
         * nodes. It is deliberate that the new zone lock is not taken
         * as memory compaction should not move pages between nodes.
         */
        page = pfn_to_page(low_pfn);
        if (page_zone(page) != zone)
            continue;

        if (!valid_page)
            valid_page = page;

        /* If isolation recently failed, do not retry */
        pageblock_nr = low_pfn >> pageblock_order;
        if (last_pageblock_nr != pageblock_nr) {
            int mt;

            last_pageblock_nr = pageblock_nr;
            if (!isolation_suitable(cc, page))
                goto next_pageblock;

            /*
             * For async migration, also only scan in MOVABLE
             * blocks. Async migration is optimistic to see if
             * the minimum amount of work satisfies the allocation
             */
            mt = get_pageblock_migratetype(page);
            if (cc->mode == MIGRATE_ASYNC &&
                !migrate_async_suitable(mt)) {
                set_unsuitable = false;
                goto next_pageblock;
            }
        }

        /*
         * Skip if free. page_order cannot be used without zone->lock
         * as nothing prevents parallel allocations or buddy merging.
         */
        if (PageBuddy(page))
            continue;

        /*
         * Check may be lockless but that's ok as we recheck later.
         * It's possible to migrate LRU pages and balloon pages.
         * Skip any other type of page.
         */
        if (!PageLRU(page)) {
            if (unlikely(balloon_page_movable(page))) {
                if (locked && balloon_page_isolate(page)) {
                    /* Successfully isolated */
                    goto isolate_success;
                }
            }
            continue;
        }

        /*
         * PageLRU is set. lru_lock normally excludes isolation
         * splitting and collapsing (collapsing has already happened
         * if PageLRU is set) but the lock is not necessarily taken
         * here and it is wasteful to take it just to check transhuge.
         * Check TransHuge without lock and skip the whole pageblock if
         * it's either a transhuge or hugetlbfs page, as calling
         * compound_order() without preventing THP from splitting the
         * page underneath us may return surprising results.
         */
        if (PageTransHuge(page)) {
            if (!locked)
                goto next_pageblock;
            low_pfn += (1 << compound_order(page)) - 1;
            continue;
        }

        /*
         * Migration will fail if an anonymous page is pinned in memory,
         * so avoid taking lru_lock and isolating it unnecessarily in an
         * admittedly racy check.
         */
        if (!page_mapping(page) &&
            page_count(page) > page_mapcount(page))
            continue;

        /* Check if it is ok to still hold the lock */
        locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
                                locked, cc);
        if (!locked || fatal_signal_pending(current))
            break;

        /* Recheck PageLRU and PageTransHuge under lock */
        if (!PageLRU(page))
            continue;
        if (PageTransHuge(page)) {
            low_pfn += (1 << compound_order(page)) - 1;
            continue;
        }

        lruvec = mem_cgroup_page_lruvec(page, zone);

        /* Try isolate the page */
        if (__isolate_lru_page(page, mode) != 0)
            continue;

        VM_BUG_ON_PAGE(PageTransCompound(page), page);

        /* Successfully isolated */
        del_page_from_lru_list(page, lruvec, page_lru(page));

isolate_success:
        cc->finished_update_migrate = true;
        list_add(&page->lru, migratelist);
        cc->nr_migratepages++;
        nr_isolated++;

        /* Avoid isolating too much */
        if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
            ++low_pfn;
            break;
        }

        continue;

next_pageblock:
        low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
    }

    acct_isolated(zone, locked, cc);

    if (locked)
        spin_unlock_irqrestore(&zone->lru_lock, flags);

    /*
     * Update the pageblock-skip information and cached scanner pfn,
     * if the whole pageblock was scanned without isolating any page.
     */
    if (low_pfn == end_pfn)
        update_pageblock_skip(cc, valid_page, nr_isolated,
                      set_unsuitable, true);

    trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);

    count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned);
    if (nr_isolated)
        count_compact_events(COMPACTISOLATED, nr_isolated);

    return low_pfn;
}
#endif /* CONFIG_COMPACTION || CONFIG_CMA */

#ifdef CONFIG_COMPACTION
/*
 * Based on information in the current compact_control, find blocks
 * suitable for isolating free pages from and then isolate them.
 */
static void isolate_freepages(struct zone *zone,
                struct compact_control *cc)
{
    struct page *page;
    unsigned long block_start_pfn;  /* start of current pageblock */
    unsigned long block_end_pfn;    /* end of current pageblock */
    unsigned long low_pfn;          /* lowest pfn scanner is able to scan */
    int nr_freepages = cc->nr_freepages;
    struct list_head *freelist = &cc->freepages;

    /*
     * Initialise the free scanner. The starting point is where we last
     * successfully isolated from, zone-cached value, or the end of the
     * zone when isolating for the first time. We need this aligned to
     * the pageblock boundary, because we do
     * block_start_pfn -= pageblock_nr_pages in the for loop.
     * For the ending point, take care when isolating in the last pageblock
     * of a zone which ends in the middle of a pageblock.
     * The low boundary is the end of the pageblock the migration scanner
     * is using.
     */
    block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
    block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
                        zone_end_pfn(zone));
    low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);

    /*
     * Isolate free pages until enough are available to migrate the
     * pages on cc->migratepages. We stop searching if the migrate
     * and free page scanners meet or enough free pages are isolated.
     */
    for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
                block_end_pfn = block_start_pfn,
                block_start_pfn -= pageblock_nr_pages) {
        unsigned long isolated;

        /*
         * This can iterate a massively long zone without finding any
         * suitable migration targets, so periodically check if we need
         * to schedule.
         */
        cond_resched();

        if (!pfn_valid(block_start_pfn))
            continue;

        /*
         * Check for overlapping nodes/zones. It's possible on some
         * configurations to have a setup like
         * node0 node1 node0
         * i.e. it's possible that all pages within a zone's range of
         * pages do not belong to a single zone.
         */
        page = pfn_to_page(block_start_pfn);
        if (page_zone(page) != zone)
            continue;

        /* Check the block is suitable for migration */
        if (!suitable_migration_target(page))
            continue;

        /* If isolation recently failed, do not retry */
        if (!isolation_suitable(cc, page))
            continue;

        /* Found a block suitable for isolating free pages from */
        cc->free_pfn = block_start_pfn;
        isolated = isolate_freepages_block(cc, block_start_pfn,
                    block_end_pfn, freelist, false);
        nr_freepages += isolated;

        /*
         * Set a flag that we successfully isolated in this pageblock.
         * In the next loop iteration, zone->compact_cached_free_pfn
         * will not be updated and thus it will effectively contain the
         * highest pageblock we isolated pages from.
         */
        if (isolated)
            cc->finished_update_free = true;
    }

    /* split_free_page does not map the pages */
    map_pages(freelist);

    /*
     * If we crossed the migrate scanner, we want to keep it that way
     * so that compact_finished() may detect this
     */
    if (block_start_pfn < low_pfn)
        cc->free_pfn = cc->migrate_pfn;

    cc->nr_freepages = nr_freepages;
}

/*
 * This is a migrate-callback that "allocates" freepages by taking pages
 * from the isolated freelists in the block we are migrating to.
 */
static struct page *compaction_alloc(struct page *migratepage,
                    unsigned long data,
                    int **result)
{
    struct compact_control *cc = (struct compact_control *)data;
    struct page *freepage;

    /* Isolate free pages if necessary */
    if (list_empty(&cc->freepages)) {
        isolate_freepages(cc->zone, cc);

        if (list_empty(&cc->freepages))
            return NULL;
    }

    freepage = list_entry(cc->freepages.next, struct page, lru);
    list_del(&freepage->lru);
    cc->nr_freepages--;

    return freepage;
}

/*
 * This is a migrate-callback that "frees" freepages back to the isolated
 * freelist. All pages on the freelist are from the same zone, so there is no
 * special handling needed for NUMA.
 */
static void compaction_free(struct page *page, unsigned long data)
{
    struct compact_control *cc = (struct compact_control *)data;

    list_add(&page->lru, &cc->freepages);
    cc->nr_freepages++;
}

/* possible outcome of isolate_migratepages */
typedef enum {
    ISOLATE_ABORT,      /* Abort compaction now */
    ISOLATE_NONE,       /* No pages isolated, continue scanning */
    ISOLATE_SUCCESS,    /* Pages isolated, migrate */
} isolate_migrate_t;

/*
 * Isolate all pages that can be migrated from the block pointed to by
 * the migrate scanner within compact_control.
 */
static isolate_migrate_t isolate_migratepages(struct zone *zone,
                    struct compact_control *cc)
{
    unsigned long low_pfn, end_pfn;

    /* Do not scan outside zone boundaries */
    low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);

    /* Only scan within a pageblock boundary */
    end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);

    /* Do not cross the free scanner or scan within a memory hole */
    if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
        cc->migrate_pfn = end_pfn;
        return ISOLATE_NONE;
    }

    /* Perform the isolation */
    low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn, false);
    if (!low_pfn || cc->contended)
        return ISOLATE_ABORT;

    cc->migrate_pfn = low_pfn;

    return ISOLATE_SUCCESS;
}
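
/*
 * Decide whether this compaction run can stop: the scanners have met, a
 * fatal signal is pending, or a free page of the requested order is now
 * available for cc->migratetype.
 */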
static int compact_finished(struct zone *zone,
                struct compact_control *cc)
{
    unsigned int order;
    unsigned long watermark;

    if (fatal_signal_pending(current))
        return COMPACT_PARTIAL;

    /* Compaction run completes if the migrate and free scanner meet */
    if (cc->free_pfn <= cc->migrate_pfn) {
        /* Let the next compaction start anew. */
        zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
        zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
        zone->compact_cached_free_pfn = zone_end_pfn(zone);

        /*
         * Mark that the PG_migrate_skip information should be cleared
         * by kswapd when it goes to sleep. kswapd does not set the
         * flag itself as the decision to clear it should be based
         * directly on an allocation request.
         */
        if (!current_is_kswapd())
            zone->compact_blockskip_flush = true;

        return COMPACT_COMPLETE;
    }

    /*
     * order == -1 is expected when compacting via
     * /proc/sys/vm/compact_memory
     */
    if (cc->order == -1)
        return COMPACT_CONTINUE;

    /* Compaction run is not finished if the watermark is not met */
    watermark = low_wmark_pages(zone);
    watermark += (1 << cc->order);

    if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
        return COMPACT_CONTINUE;

    /* Direct compactor: Is a suitable page free? */
    for (order = cc->order; order < MAX_ORDER; order++) {
        struct free_area *area = &zone->free_area[order];

        /* Job done if page is free of the right migratetype */
        if (!list_empty(&area->free_list[cc->migratetype]))
            return COMPACT_PARTIAL;

        /* Job done if allocation would set block type */
        if (cc->order >= pageblock_order && area->nr_free)
            return COMPACT_PARTIAL;
    }

    return COMPACT_CONTINUE;
}

/*
 * compaction_suitable: Is this suitable to run compaction on this zone now?
 * Returns
 *   COMPACT_SKIPPED  - If there are too few free pages for compaction
 *   COMPACT_PARTIAL  - If the allocation would succeed without compaction
 *   COMPACT_CONTINUE - If compaction should run now
 */
unsigned long compaction_suitable(struct zone *zone, int order)
{
    int fragindex;
    unsigned long watermark;

    /*
     * order == -1 is expected when compacting via
     * /proc/sys/vm/compact_memory
     */
    if (order == -1)
        return COMPACT_CONTINUE;

    /*
     * Watermarks for order-0 must be met for compaction. Note the 2UL.
     * This is because during migration, copies of pages need to be
     * allocated and for a short time, the footprint is higher
     */
    watermark = low_wmark_pages(zone) + (2UL << order);
    if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
        return COMPACT_SKIPPED;

    /*
     * fragmentation index determines if allocation failures are due to
     * low memory or external fragmentation
     *
     * index of -1000 implies allocations might succeed depending on
     * watermarks
     * index towards 0 implies failure is due to lack of memory
     * index towards 1000 implies failure is due to fragmentation
     *
     * Only compact if a failure would be due to fragmentation.
     */
    fragindex = fragmentation_index(zone, order);
    if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
        return COMPACT_SKIPPED;

    if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark,
        0, 0))
        return COMPACT_PARTIAL;

    return COMPACT_CONTINUE;
}
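
/*
 * Compact a single zone: position the migrate and free scanners, then
 * alternate between isolating movable pages and migrating them into the
 * isolated free pages until compact_finished() reports a result.
 */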
static int compact_zone(struct zone *zone, struct compact_control *cc)
{
    int ret;
    unsigned long start_pfn = zone->zone_start_pfn;
    unsigned long end_pfn = zone_end_pfn(zone);
    const bool sync = cc->mode != MIGRATE_ASYNC;

    ret = compaction_suitable(zone, cc->order);
    switch (ret) {
    case COMPACT_PARTIAL:
    case COMPACT_SKIPPED:
        /* Compaction is likely to fail */
        return ret;
    case COMPACT_CONTINUE:
        /* Fall through to compaction */
        ;
    }

    /*
     * Clear pageblock skip if there were failures recently and compaction
     * is about to be retried after being deferred. kswapd does not do
     * this reset as it'll reset the cached information when going to sleep.
     */
    if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
        __reset_isolation_suitable(zone);

    /*
     * Setup to move all movable pages to the end of the zone. Use cached
     * information on where the scanners should start, but check that it
     * is initialised by ensuring the values are within zone boundaries.
     */
    cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
    cc->free_pfn = zone->compact_cached_free_pfn;
    if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
        cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
        zone->compact_cached_free_pfn = cc->free_pfn;
    }
    if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
        cc->migrate_pfn = start_pfn;
        zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
        zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
    }

    trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn);

    migrate_prep_local();

    while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
        int err;

        switch (isolate_migratepages(zone, cc)) {
        case ISOLATE_ABORT:
            ret = COMPACT_PARTIAL;
            putback_movable_pages(&cc->migratepages);
            cc->nr_migratepages = 0;
            goto out;
        case ISOLATE_NONE:
            continue;
        case ISOLATE_SUCCESS:
            ;
        }

        if (!cc->nr_migratepages)
            continue;

        err = migrate_pages(&cc->migratepages, compaction_alloc,
                compaction_free, (unsigned long)cc, cc->mode,
                MR_COMPACTION);

        trace_mm_compaction_migratepages(cc->nr_migratepages, err,
                            &cc->migratepages);

        /* All pages were either migrated or will be released */
        cc->nr_migratepages = 0;
        if (err) {
            putback_movable_pages(&cc->migratepages);
            /*
             * migrate_pages() may return -ENOMEM when scanners meet
             * and we want compact_finished() to detect it
             */
            if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) {
                ret = COMPACT_PARTIAL;
                goto out;
            }
        }
    }

out:
    /* Release free pages and check accounting */
    cc->nr_freepages -= release_freepages(&cc->freepages);
    VM_BUG_ON(cc->nr_freepages != 0);

    trace_mm_compaction_end(ret);

    return ret;
}
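
/*
 * Set up a compact_control for a direct compaction request and compact a
 * single zone, reporting back whether lock contention forced an abort.
 */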
static unsigned long compact_zone_order(struct zone *zone, int order,
        gfp_t gfp_mask, enum migrate_mode mode, bool *contended)
{
    unsigned long ret;
    struct compact_control cc = {
        .nr_freepages = 0,
        .nr_migratepages = 0,
        .order = order,
        .migratetype = allocflags_to_migratetype(gfp_mask),
        .zone = zone,
        .mode = mode,
    };
    INIT_LIST_HEAD(&cc.freepages);
    INIT_LIST_HEAD(&cc.migratepages);

    ret = compact_zone(zone, &cc);

    VM_BUG_ON(!list_empty(&cc.freepages));
    VM_BUG_ON(!list_empty(&cc.migratepages));

    *contended = cc.contended;
    return ret;
}

int sysctl_extfrag_threshold = 500;

/**
 * try_to_compact_pages - Direct compact to satisfy a high-order allocation
 * @zonelist:  The zonelist used for the current allocation
 * @order:     The order of the current allocation
 * @gfp_mask:  The GFP mask of the current allocation
 * @nodemask:  The allowed nodes to allocate from
 * @mode:      The migration mode for async, sync light, or sync migration
 * @contended: Return value that is true if compaction was aborted due to lock contention
 *
 * This is the main entry point for direct page compaction.
 */
unsigned long try_to_compact_pages(struct zonelist *zonelist,
            int order, gfp_t gfp_mask, nodemask_t *nodemask,
            enum migrate_mode mode, bool *contended)
{
    enum zone_type high_zoneidx = gfp_zone(gfp_mask);
    int may_enter_fs = gfp_mask & __GFP_FS;
    int may_perform_io = gfp_mask & __GFP_IO;
    struct zoneref *z;
    struct zone *zone;
    int rc = COMPACT_SKIPPED;
    int alloc_flags = 0;

    /* Check if the GFP flags allow compaction */
    if (!order || !may_enter_fs || !may_perform_io)
        return rc;

    count_compact_event(COMPACTSTALL);

#ifdef CONFIG_CMA
    if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
        alloc_flags |= ALLOC_CMA;
#endif
    /* Compact each zone in the list */
    for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
                                nodemask) {
        int status;

        status = compact_zone_order(zone, order, gfp_mask, mode,
                        contended);
        rc = max(status, rc);

        /* If a normal allocation would succeed, stop compacting */
        if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0,
                      alloc_flags))
            break;
    }

    return rc;
}

/* Compact all zones within a node */
static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
{
    int zoneid;
    struct zone *zone;

    for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {

        zone = &pgdat->node_zones[zoneid];
        if (!populated_zone(zone))
            continue;

        cc->nr_freepages = 0;
        cc->nr_migratepages = 0;
        cc->zone = zone;
        INIT_LIST_HEAD(&cc->freepages);
        INIT_LIST_HEAD(&cc->migratepages);

        if (cc->order == -1 || !compaction_deferred(zone, cc->order))
            compact_zone(zone, cc);

        if (cc->order > 0) {
            if (zone_watermark_ok(zone, cc->order,
                        low_wmark_pages(zone), 0, 0))
                compaction_defer_reset(zone, cc->order, false);
        }

        VM_BUG_ON(!list_empty(&cc->freepages));
        VM_BUG_ON(!list_empty(&cc->migratepages));
    }
}
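
/* Asynchronously compact all zones of a node for an allocation of the given order */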
void compact_pgdat(pg_data_t *pgdat, int order)
{
    struct compact_control cc = {
        .order = order,
        .mode = MIGRATE_ASYNC,
    };

    if (!order)
        return;

    __compact_pgdat(pgdat, &cc);
}

static void compact_node(int nid)
{
    struct compact_control cc = {
        .order = -1,
        .mode = MIGRATE_SYNC,
        .ignore_skip_hint = true,
    };

    __compact_pgdat(NODE_DATA(nid), &cc);
}

/* Compact all nodes in the system */
static void compact_nodes(void)
{
    int nid;

    /* Flush pending updates to the LRU lists */
    lru_add_drain_all();

    for_each_online_node(nid)
        compact_node(nid);
}

/* The written value is actually unused, all memory is compacted */
int sysctl_compact_memory;

/* This is the entry point for compacting all nodes via /proc/sys/vm */
int sysctl_compaction_handler(struct ctl_table *table, int write,
            void __user *buffer, size_t *length, loff_t *ppos)
{
    if (write)
        compact_nodes();

    return 0;
}
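
/* sysctl handler for /proc/sys/vm/extfrag_threshold */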
int sysctl_extfrag_handler(struct ctl_table *table, int write,
            void __user *buffer, size_t *length, loff_t *ppos)
{
    proc_dointvec_minmax(table, write, buffer, length, ppos);

    return 0;
}

#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
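/* Write handler for the per-node sysfs 'compact' attribute: compacts that node */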
static ssize_t sysfs_compact_node(struct device *dev,
            struct device_attribute *attr,
            const char *buf, size_t count)
{
    int nid = dev->id;

    if (nid >= 0 && nid < nr_node_ids && node_online(nid)) {
        /* Flush pending updates to the LRU lists */
        lru_add_drain_all();

        compact_node(nid);
    }

    return count;
}
static DEVICE_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);

int compaction_register_node(struct node *node)
{
    return device_create_file(&node->dev, &dev_attr_compact);
}

void compaction_unregister_node(struct node *node)
{
    return device_remove_file(&node->dev, &dev_attr_compact);
}
#endif /* CONFIG_SYSFS && CONFIG_NUMA */

#endif /* CONFIG_COMPACTION */