swapfile.c
/*
 * linux/mm/swapfile.c
 *
 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
 * Swap reorganised 29.12.95, Stephen Tweedie
 */
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/namei.h>
#include <linux/shmem_fs.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/writeback.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
#include <linux/mutex.h>
#include <linux/capability.h>
#include <linux/syscalls.h>
#include <linux/memcontrol.h>
#include <linux/poll.h>
#include <linux/oom.h>
#include <linux/frontswap.h>
#include <linux/swapfile.h>
#include <linux/export.h>
#include <linux/swap_slots.h>
#include <linux/sort.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <linux/swapops.h>
#include <linux/swap_cgroup.h>

static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
				 unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
static sector_t map_swap_entry(swp_entry_t, struct block_device**);

DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
atomic_long_t nr_swap_pages;
/*
 * Some modules use swappable objects and may try to swap them out under
 * memory pressure (via the shrinker). Before doing so, they may wish to
 * check to see if any swap space is available.
 */
EXPORT_SYMBOL_GPL(nr_swap_pages);
/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
long total_swap_pages;
static int least_priority = -1;

static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";

/*
 * all active swap_info_structs
 * protected with swap_lock, and ordered by priority.
 */
PLIST_HEAD(swap_active_head);

/*
 * all available (active, not full) swap_info_structs
 * protected with swap_avail_lock, ordered by priority.
 * This is used by get_swap_page() instead of swap_active_head
 * because swap_active_head includes all swap_info_structs,
 * but get_swap_page() doesn't need to look at full ones.
 * This uses its own lock instead of swap_lock because when a
 * swap_info_struct changes between not-full/full, it needs to
 * add/remove itself to/from this list, but the swap_info_struct->lock
 * is held and the locking order requires swap_lock to be taken
 * before any swap_info_struct->lock.
 */
static struct plist_head *swap_avail_heads;
static DEFINE_SPINLOCK(swap_avail_lock);

struct swap_info_struct *swap_info[MAX_SWAPFILES];

static DEFINE_MUTEX(swapon_mutex);

static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
/* Activity counter to indicate that a swapon or swapoff has occurred */
static atomic_t proc_poll_event = ATOMIC_INIT(0);

atomic_t nr_rotate_swap = ATOMIC_INIT(0);

static inline unsigned char swap_count(unsigned char ent)
{
	return ent & ~SWAP_HAS_CACHE;	/* may include COUNT_CONTINUED flag */
}
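
/*
 * Each swap_map entry is one byte per swap slot: the low bits hold the
 * duplication count, SWAP_HAS_CACHE marks that the slot is also present
 * in the swap cache, and COUNT_CONTINUED means the count overflows into
 * continuation pages (see swap_count_continued()). As a rough sketch of
 * how a raw map byte is interpreted (si and offset are illustrative
 * names, following the conventions used below):
 *
 *	unsigned char ent = si->swap_map[offset];
 *	bool cached = ent & SWAP_HAS_CACHE;
 *	unsigned char refs = swap_count(ent);	(may include COUNT_CONTINUED)
 */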
/* returns 1 if swap entry is freed */
static int
__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
{
	swp_entry_t entry = swp_entry(si->type, offset);
	struct page *page;
	int ret = 0;

	page = find_get_page(swap_address_space(entry), swp_offset(entry));
	if (!page)
		return 0;
	/*
	 * This function is called from scan_swap_map(), which is reached
	 * from vmscan.c while reclaiming pages, so a page lock may already
	 * be held here. We have to use trylock to avoid deadlock. This is
	 * a special case; in usual operation use try_to_free_swap() with
	 * an explicit lock_page().
	 */
	if (trylock_page(page)) {
		ret = try_to_free_swap(page);
		unlock_page(page);
	}
	put_page(page);
	return ret;
}

/*
 * swapon tells the device that all the old swap contents can be discarded,
 * to allow the swap device to optimize its wear-levelling.
 */
static int discard_swap(struct swap_info_struct *si)
{
	struct swap_extent *se;
	sector_t start_block;
	sector_t nr_blocks;
	int err = 0;

	/* Do not discard the swap header page! */
	se = &si->first_swap_extent;
	start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
	nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
	if (nr_blocks) {
		err = blkdev_issue_discard(si->bdev, start_block,
				nr_blocks, GFP_KERNEL, 0);
		if (err)
			return err;
		cond_resched();
	}

	list_for_each_entry(se, &si->first_swap_extent.list, list) {
		start_block = se->start_block << (PAGE_SHIFT - 9);
		nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);

		err = blkdev_issue_discard(si->bdev, start_block,
				nr_blocks, GFP_KERNEL, 0);
		if (err)
			break;

		cond_resched();
	}
	return err;		/* That will often be -EOPNOTSUPP */
}
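
/*
 * The shifts by (PAGE_SHIFT - 9) above convert page-sized units into the
 * 512-byte sectors that blkdev_issue_discard() expects. For example, with
 * 4 KiB pages (PAGE_SHIFT == 12) the shift is 3, so each swap page covers
 * 2^3 = 8 sectors and page N of an extent starts at sector
 * (se->start_block + N) * 8.
 */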
/*
 * swap allocation tells the device that a cluster of swap can now be
 * discarded, to allow the swap device to optimize its wear-levelling.
 */
static void discard_swap_cluster(struct swap_info_struct *si,
				 pgoff_t start_page, pgoff_t nr_pages)
{
	struct swap_extent *se = si->curr_swap_extent;
	int found_extent = 0;

	while (nr_pages) {
		if (se->start_page <= start_page &&
		    start_page < se->start_page + se->nr_pages) {
			pgoff_t offset = start_page - se->start_page;
			sector_t start_block = se->start_block + offset;
			sector_t nr_blocks = se->nr_pages - offset;

			if (nr_blocks > nr_pages)
				nr_blocks = nr_pages;
			start_page += nr_blocks;
			nr_pages -= nr_blocks;

			if (!found_extent++)
				si->curr_swap_extent = se;

			start_block <<= PAGE_SHIFT - 9;
			nr_blocks <<= PAGE_SHIFT - 9;
			if (blkdev_issue_discard(si->bdev, start_block,
					nr_blocks, GFP_NOIO, 0))
				break;
		}

		se = list_next_entry(se, list);
	}
}

#ifdef CONFIG_THP_SWAP
#define SWAPFILE_CLUSTER	HPAGE_PMD_NR
#define swap_entry_size(size)	(size)
#else
#define SWAPFILE_CLUSTER	256
/*
 * Define swap_entry_size() as constant to let the compiler optimize
 * out some code if !CONFIG_THP_SWAP
 */
#define swap_entry_size(size)	1
#endif
#define LATENCY_LIMIT		256
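
/*
 * A cluster groups SWAPFILE_CLUSTER consecutive swap slots, so a slot's
 * cluster index is simply offset / SWAPFILE_CLUSTER. For example, in the
 * !CONFIG_THP_SWAP case with 4 KiB pages a cluster covers
 * 256 * 4 KiB = 1 MiB of swap space; with CONFIG_THP_SWAP it covers
 * exactly one PMD-sized huge page (HPAGE_PMD_NR slots).
 */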
static inline void cluster_set_flag(struct swap_cluster_info *info,
	unsigned int flag)
{
	info->flags = flag;
}

static inline unsigned int cluster_count(struct swap_cluster_info *info)
{
	return info->data;
}

static inline void cluster_set_count(struct swap_cluster_info *info,
				     unsigned int c)
{
	info->data = c;
}

static inline void cluster_set_count_flag(struct swap_cluster_info *info,
					  unsigned int c, unsigned int f)
{
	info->flags = f;
	info->data = c;
}

static inline unsigned int cluster_next(struct swap_cluster_info *info)
{
	return info->data;
}

static inline void cluster_set_next(struct swap_cluster_info *info,
				    unsigned int n)
{
	info->data = n;
}

static inline void cluster_set_next_flag(struct swap_cluster_info *info,
					 unsigned int n, unsigned int f)
{
	info->flags = f;
	info->data = n;
}

static inline bool cluster_is_free(struct swap_cluster_info *info)
{
	return info->flags & CLUSTER_FLAG_FREE;
}

static inline bool cluster_is_null(struct swap_cluster_info *info)
{
	return info->flags & CLUSTER_FLAG_NEXT_NULL;
}

static inline void cluster_set_null(struct swap_cluster_info *info)
{
	info->flags = CLUSTER_FLAG_NEXT_NULL;
	info->data = 0;
}

static inline bool cluster_is_huge(struct swap_cluster_info *info)
{
	if (IS_ENABLED(CONFIG_THP_SWAP))
		return info->flags & CLUSTER_FLAG_HUGE;
	return false;
}

static inline void cluster_clear_huge(struct swap_cluster_info *info)
{
	info->flags &= ~CLUSTER_FLAG_HUGE;
}

static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
						     unsigned long offset)
{
	struct swap_cluster_info *ci;

	ci = si->cluster_info;
	if (ci) {
		ci += offset / SWAPFILE_CLUSTER;
		spin_lock(&ci->lock);
	}
	return ci;
}

static inline void unlock_cluster(struct swap_cluster_info *ci)
{
	if (ci)
		spin_unlock(&ci->lock);
}

/*
 * Determine the locking method in use for this device. Return
 * swap_cluster_info if SSD-style cluster-based locking is in place.
 */
static inline struct swap_cluster_info *lock_cluster_or_swap_info(
	struct swap_info_struct *si, unsigned long offset)
{
	struct swap_cluster_info *ci;

	/* Try to use fine-grained SSD-style locking if available: */
	ci = lock_cluster(si, offset);
	/* Otherwise, fall back to traditional, coarse locking: */
	if (!ci)
		spin_lock(&si->lock);

	return ci;
}

static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
					       struct swap_cluster_info *ci)
{
	if (ci)
		unlock_cluster(ci);
	else
		spin_unlock(&si->lock);
}
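
/*
 * In short, there are two locking modes: devices with a cluster_info
 * array (the SSD case) take the per-cluster spinlock, everything else
 * falls back to the single si->lock. A caller touching only one slot
 * typically brackets the access like this (page_swapcount() below is a
 * real user of this pattern):
 *
 *	ci = lock_cluster_or_swap_info(si, offset);
 *	count = swap_count(si->swap_map[offset]);
 *	unlock_cluster_or_swap_info(si, ci);
 */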
static inline bool cluster_list_empty(struct swap_cluster_list *list)
{
	return cluster_is_null(&list->head);
}

static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
{
	return cluster_next(&list->head);
}

static void cluster_list_init(struct swap_cluster_list *list)
{
	cluster_set_null(&list->head);
	cluster_set_null(&list->tail);
}

static void cluster_list_add_tail(struct swap_cluster_list *list,
				  struct swap_cluster_info *ci,
				  unsigned int idx)
{
	if (cluster_list_empty(list)) {
		cluster_set_next_flag(&list->head, idx, 0);
		cluster_set_next_flag(&list->tail, idx, 0);
	} else {
		struct swap_cluster_info *ci_tail;
		unsigned int tail = cluster_next(&list->tail);

		/*
		 * Nested cluster lock, but both cluster locks are
		 * only acquired while we hold swap_info_struct->lock
		 */
		ci_tail = ci + tail;
		spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
		cluster_set_next(ci_tail, idx);
		spin_unlock(&ci_tail->lock);
		cluster_set_next_flag(&list->tail, idx, 0);
	}
}

static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
					   struct swap_cluster_info *ci)
{
	unsigned int idx;

	idx = cluster_next(&list->head);
	if (cluster_next(&list->tail) == idx) {
		cluster_set_null(&list->head);
		cluster_set_null(&list->tail);
	} else
		cluster_set_next_flag(&list->head,
				      cluster_next(&ci[idx]), 0);

	return idx;
}
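
/*
 * Note that swap_cluster_list is not a pointer-based list: head and tail
 * store a cluster *index* in their data field, and each cluster on the
 * list stores the index of the next cluster in its own data field (set
 * via cluster_set_next() in cluster_list_add_tail() above). An empty
 * list is encoded with CLUSTER_FLAG_NEXT_NULL rather than a NULL pointer,
 * which is why cluster_list_empty() checks cluster_is_null(&list->head).
 */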
/* Add a cluster to discard list and schedule it to do discard */
static void swap_cluster_schedule_discard(struct swap_info_struct *si,
		unsigned int idx)
{
	/*
	 * If scan_swap_map() can't find a free cluster, it will check
	 * si->swap_map directly. To make sure the discarding cluster isn't
	 * taken by scan_swap_map(), mark the swap entries bad (occupied).
	 * They will be cleared after the discard.
	 */
	memset(si->swap_map + idx * SWAPFILE_CLUSTER,
			SWAP_MAP_BAD, SWAPFILE_CLUSTER);

	cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);

	schedule_work(&si->discard_work);
}

static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
{
	struct swap_cluster_info *ci = si->cluster_info;

	cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE);
	cluster_list_add_tail(&si->free_clusters, ci, idx);
}

/*
 * Do the actual discard. After a cluster discard is finished, the cluster
 * will be added to the free cluster list. The caller should hold si->lock.
 */
static void swap_do_scheduled_discard(struct swap_info_struct *si)
{
	struct swap_cluster_info *info, *ci;
	unsigned int idx;

	info = si->cluster_info;

	while (!cluster_list_empty(&si->discard_clusters)) {
		idx = cluster_list_del_first(&si->discard_clusters, info);
		spin_unlock(&si->lock);

		discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
				SWAPFILE_CLUSTER);

		spin_lock(&si->lock);
		ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
		__free_cluster(si, idx);
		memset(si->swap_map + idx * SWAPFILE_CLUSTER,
				0, SWAPFILE_CLUSTER);
		unlock_cluster(ci);
	}
}

static void swap_discard_work(struct work_struct *work)
{
	struct swap_info_struct *si;

	si = container_of(work, struct swap_info_struct, discard_work);

	spin_lock(&si->lock);
	swap_do_scheduled_discard(si);
	spin_unlock(&si->lock);
}

static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
{
	struct swap_cluster_info *ci = si->cluster_info;

	VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
	cluster_list_del_first(&si->free_clusters, ci);
	cluster_set_count_flag(ci + idx, 0, 0);
}

static void free_cluster(struct swap_info_struct *si, unsigned long idx)
{
	struct swap_cluster_info *ci = si->cluster_info + idx;

	VM_BUG_ON(cluster_count(ci) != 0);
	/*
	 * If the swap is discardable, schedule a discard of the cluster
	 * instead of freeing it immediately. The cluster will be freed
	 * after the discard.
	 */
	if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
	    (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
		swap_cluster_schedule_discard(si, idx);
		return;
	}

	__free_cluster(si, idx);
}
/*
 * The cluster corresponding to page_nr will be used. The cluster will be
 * removed from the free cluster list and its usage counter will be increased.
 */
static void inc_cluster_info_page(struct swap_info_struct *p,
	struct swap_cluster_info *cluster_info, unsigned long page_nr)
{
	unsigned long idx = page_nr / SWAPFILE_CLUSTER;

	if (!cluster_info)
		return;
	if (cluster_is_free(&cluster_info[idx]))
		alloc_cluster(p, idx);

	VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
	cluster_set_count(&cluster_info[idx],
		cluster_count(&cluster_info[idx]) + 1);
}

/*
 * The cluster corresponding to page_nr has its usage decreased by one.
 * If the usage counter becomes 0, meaning no page in the cluster is in
 * use, we can optionally discard the cluster and add it to the free
 * cluster list.
 */
static void dec_cluster_info_page(struct swap_info_struct *p,
	struct swap_cluster_info *cluster_info, unsigned long page_nr)
{
	unsigned long idx = page_nr / SWAPFILE_CLUSTER;

	if (!cluster_info)
		return;

	VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
	cluster_set_count(&cluster_info[idx],
		cluster_count(&cluster_info[idx]) - 1);

	if (cluster_count(&cluster_info[idx]) == 0)
		free_cluster(p, idx);
}

/*
 * It's possible for scan_swap_map() to use a free cluster from the middle
 * of the free cluster list. Avoid such abuse to prevent list corruption.
 */
static bool
scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
	unsigned long offset)
{
	struct percpu_cluster *percpu_cluster;
	bool conflict;

	offset /= SWAPFILE_CLUSTER;
	conflict = !cluster_list_empty(&si->free_clusters) &&
		offset != cluster_list_first(&si->free_clusters) &&
		cluster_is_free(&si->cluster_info[offset]);

	if (!conflict)
		return false;

	percpu_cluster = this_cpu_ptr(si->percpu_cluster);
	cluster_set_null(&percpu_cluster->index);
	return true;
}
/*
 * Try to get a swap entry from the current CPU's swap entry pool (a cluster).
 * This might involve allocating a new cluster for the current CPU too.
 */
static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
	unsigned long *offset, unsigned long *scan_base)
{
	struct percpu_cluster *cluster;
	struct swap_cluster_info *ci;
	bool found_free;
	unsigned long tmp, max;

new_cluster:
	cluster = this_cpu_ptr(si->percpu_cluster);
	if (cluster_is_null(&cluster->index)) {
		if (!cluster_list_empty(&si->free_clusters)) {
			cluster->index = si->free_clusters.head;
			cluster->next = cluster_next(&cluster->index) *
					SWAPFILE_CLUSTER;
		} else if (!cluster_list_empty(&si->discard_clusters)) {
			/*
			 * We don't have a free cluster but have some clusters
			 * being discarded; do the discard now and reclaim them.
			 */
			swap_do_scheduled_discard(si);
			*scan_base = *offset = si->cluster_next;
			goto new_cluster;
		} else
			return false;
	}

	found_free = false;

	/*
	 * Other CPUs can use our cluster if they can't find a free cluster;
	 * check whether there is still a free entry in the cluster.
	 */
	tmp = cluster->next;
	max = min_t(unsigned long, si->max,
		    (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
	if (tmp >= max) {
		cluster_set_null(&cluster->index);
		goto new_cluster;
	}
	ci = lock_cluster(si, tmp);
	while (tmp < max) {
		if (!si->swap_map[tmp]) {
			found_free = true;
			break;
		}
		tmp++;
	}
	unlock_cluster(ci);
	if (!found_free) {
		cluster_set_null(&cluster->index);
		goto new_cluster;
	}
	cluster->next = tmp + 1;
	*offset = tmp;
	*scan_base = tmp;
	return found_free;
}
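
/*
 * Roughly, scan_swap_map_try_ssd_cluster() gives each CPU a private
 * "current cluster" (si->percpu_cluster) and hands out slots from it
 * sequentially via cluster->next, so concurrent allocators tend to work
 * in different clusters and contend less on the shared scan state. Only
 * when the per-CPU cluster is exhausted (possibly because other CPUs used
 * its free entries, as noted above) does it fall back to the shared free
 * and discard cluster lists.
 */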
static void __del_from_avail_list(struct swap_info_struct *p)
{
	int nid;

	for_each_node(nid)
		plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
}

static void del_from_avail_list(struct swap_info_struct *p)
{
	spin_lock(&swap_avail_lock);
	__del_from_avail_list(p);
	spin_unlock(&swap_avail_lock);
}

static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
			     unsigned int nr_entries)
{
	unsigned int end = offset + nr_entries - 1;

	if (offset == si->lowest_bit)
		si->lowest_bit += nr_entries;
	if (end == si->highest_bit)
		si->highest_bit -= nr_entries;
	si->inuse_pages += nr_entries;
	if (si->inuse_pages == si->pages) {
		si->lowest_bit = si->max;
		si->highest_bit = 0;
		del_from_avail_list(si);
	}
}

static void add_to_avail_list(struct swap_info_struct *p)
{
	int nid;

	spin_lock(&swap_avail_lock);
	for_each_node(nid) {
		WARN_ON(!plist_node_empty(&p->avail_lists[nid]));
		plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
	}
	spin_unlock(&swap_avail_lock);
}

static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
			    unsigned int nr_entries)
{
	unsigned long end = offset + nr_entries - 1;
	void (*swap_slot_free_notify)(struct block_device *, unsigned long);

	if (offset < si->lowest_bit)
		si->lowest_bit = offset;
	if (end > si->highest_bit) {
		bool was_full = !si->highest_bit;

		si->highest_bit = end;
		if (was_full && (si->flags & SWP_WRITEOK))
			add_to_avail_list(si);
	}
	atomic_long_add(nr_entries, &nr_swap_pages);
	si->inuse_pages -= nr_entries;
	if (si->flags & SWP_BLKDEV)
		swap_slot_free_notify =
			si->bdev->bd_disk->fops->swap_slot_free_notify;
	else
		swap_slot_free_notify = NULL;
	while (offset <= end) {
		frontswap_invalidate_page(si->type, offset);
		if (swap_slot_free_notify)
			swap_slot_free_notify(si->bdev, offset);
		offset++;
	}
}
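
/*
 * swap_range_alloc() and swap_range_free() maintain the
 * [lowest_bit, highest_bit] window of offsets that may still hold free
 * slots: allocation shrinks the window from whichever end it touched,
 * freeing widens it again. When the device becomes completely full the
 * window collapses (lowest_bit = max, highest_bit = 0) and the device is
 * dropped from the avail lists; freeing a slot on a full, writable device
 * puts it back.
 */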
static int scan_swap_map_slots(struct swap_info_struct *si,
			       unsigned char usage, int nr,
			       swp_entry_t slots[])
{
	struct swap_cluster_info *ci;
	unsigned long offset;
	unsigned long scan_base;
	unsigned long last_in_cluster = 0;
	int latency_ration = LATENCY_LIMIT;
	int n_ret = 0;

	if (nr > SWAP_BATCH)
		nr = SWAP_BATCH;

	/*
	 * We try to cluster swap pages by allocating them sequentially
	 * in swap. Once we've allocated SWAPFILE_CLUSTER pages this
	 * way, however, we resort to first-free allocation, starting
	 * a new cluster. This prevents us from scattering swap pages
	 * all over the entire swap partition, so that we reduce
	 * overall disk seek times between swap pages. -- sct
	 * But we do now try to find an empty cluster. -Andrea
	 * And we let swap pages go all over an SSD partition. Hugh
	 */

	si->flags += SWP_SCANNING;
	scan_base = offset = si->cluster_next;

	/* SSD algorithm */
	if (si->cluster_info) {
		if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
			goto checks;
		else
			goto scan;
	}

	if (unlikely(!si->cluster_nr--)) {
		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
			si->cluster_nr = SWAPFILE_CLUSTER - 1;
			goto checks;
		}

		spin_unlock(&si->lock);

		/*
		 * If seek is expensive, start searching for new cluster from
		 * start of partition, to minimize the span of allocated swap.
		 * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info
		 * case, just handled by scan_swap_map_try_ssd_cluster() above.
		 */
		scan_base = offset = si->lowest_bit;
		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;

		/* Locate the first empty (unaligned) cluster */
		for (; last_in_cluster <= si->highest_bit; offset++) {
			if (si->swap_map[offset])
				last_in_cluster = offset + SWAPFILE_CLUSTER;
			else if (offset == last_in_cluster) {
				spin_lock(&si->lock);
				offset -= SWAPFILE_CLUSTER - 1;
				si->cluster_next = offset;
				si->cluster_nr = SWAPFILE_CLUSTER - 1;
				goto checks;
			}
			if (unlikely(--latency_ration < 0)) {
				cond_resched();
				latency_ration = LATENCY_LIMIT;
			}
		}

		offset = scan_base;
		spin_lock(&si->lock);
		si->cluster_nr = SWAPFILE_CLUSTER - 1;
	}
checks:
	if (si->cluster_info) {
		while (scan_swap_map_ssd_cluster_conflict(si, offset)) {
			/* take a break if we already got some slots */
			if (n_ret)
				goto done;
			if (!scan_swap_map_try_ssd_cluster(si, &offset,
							   &scan_base))
				goto scan;
		}
	}
	if (!(si->flags & SWP_WRITEOK))
		goto no_page;
	if (!si->highest_bit)
		goto no_page;
	if (offset > si->highest_bit)
		scan_base = offset = si->lowest_bit;

	ci = lock_cluster(si, offset);
	/* reuse swap entry of cache-only swap if not busy. */
	if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
		int swap_was_freed;

		unlock_cluster(ci);
		spin_unlock(&si->lock);
		swap_was_freed = __try_to_reclaim_swap(si, offset);
		spin_lock(&si->lock);
		/* entry was freed successfully, try to use this again */
		if (swap_was_freed)
			goto checks;
		goto scan; /* check next one */
	}

	if (si->swap_map[offset]) {
		unlock_cluster(ci);
		if (!n_ret)
			goto scan;
		else
			goto done;
	}
	si->swap_map[offset] = usage;
	inc_cluster_info_page(si, si->cluster_info, offset);
	unlock_cluster(ci);

	swap_range_alloc(si, offset, 1);
	si->cluster_next = offset + 1;
	slots[n_ret++] = swp_entry(si->type, offset);

	/* got enough slots or reach max slots? */
	if ((n_ret == nr) || (offset >= si->highest_bit))
		goto done;

	/* search for next available slot */

	/* time to take a break? */
	if (unlikely(--latency_ration < 0)) {
		if (n_ret)
			goto done;
		spin_unlock(&si->lock);
		cond_resched();
		spin_lock(&si->lock);
		latency_ration = LATENCY_LIMIT;
	}

	/* try to get more slots in cluster */
	if (si->cluster_info) {
		if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
			goto checks;
		else
			goto done;
	}
	/* non-ssd case */
	++offset;

	/* non-ssd case, still more slots in cluster? */
	if (si->cluster_nr && !si->swap_map[offset]) {
		--si->cluster_nr;
		goto checks;
	}

done:
	si->flags -= SWP_SCANNING;
	return n_ret;

scan:
	spin_unlock(&si->lock);
	while (++offset <= si->highest_bit) {
		if (!si->swap_map[offset]) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (unlikely(--latency_ration < 0)) {
			cond_resched();
			latency_ration = LATENCY_LIMIT;
		}
	}
	offset = si->lowest_bit;
	while (offset < scan_base) {
		if (!si->swap_map[offset]) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (unlikely(--latency_ration < 0)) {
			cond_resched();
			latency_ration = LATENCY_LIMIT;
		}
		offset++;
	}
	spin_lock(&si->lock);

no_page:
	si->flags -= SWP_SCANNING;
	return n_ret;
}
static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
{
	unsigned long idx;
	struct swap_cluster_info *ci;
	unsigned long offset, i;
	unsigned char *map;

	/*
	 * Should not even be attempting cluster allocations when huge
	 * page swap is disabled. Warn and fail the allocation.
	 */
	if (!IS_ENABLED(CONFIG_THP_SWAP)) {
		VM_WARN_ON_ONCE(1);
		return 0;
	}

	if (cluster_list_empty(&si->free_clusters))
		return 0;

	idx = cluster_list_first(&si->free_clusters);
	offset = idx * SWAPFILE_CLUSTER;
	ci = lock_cluster(si, offset);
	alloc_cluster(si, idx);
	cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);

	map = si->swap_map + offset;
	for (i = 0; i < SWAPFILE_CLUSTER; i++)
		map[i] = SWAP_HAS_CACHE;
	unlock_cluster(ci);
	swap_range_alloc(si, offset, SWAPFILE_CLUSTER);
	*slot = swp_entry(si->type, offset);

	return 1;
}

static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
{
	unsigned long offset = idx * SWAPFILE_CLUSTER;
	struct swap_cluster_info *ci;

	ci = lock_cluster(si, offset);
	cluster_set_count_flag(ci, 0, 0);
	free_cluster(si, idx);
	unlock_cluster(ci);
	swap_range_free(si, offset, SWAPFILE_CLUSTER);
}

static unsigned long scan_swap_map(struct swap_info_struct *si,
				   unsigned char usage)
{
	swp_entry_t entry;
	int n_ret;

	n_ret = scan_swap_map_slots(si, usage, 1, &entry);

	if (n_ret)
		return swp_offset(entry);
	else
		return 0;
}
int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
{
	unsigned long size = swap_entry_size(entry_size);
	struct swap_info_struct *si, *next;
	long avail_pgs;
	int n_ret = 0;
	int node;

	/* Only single cluster request supported */
	WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);

	avail_pgs = atomic_long_read(&nr_swap_pages) / size;
	if (avail_pgs <= 0)
		goto noswap;

	if (n_goal > SWAP_BATCH)
		n_goal = SWAP_BATCH;

	if (n_goal > avail_pgs)
		n_goal = avail_pgs;

	atomic_long_sub(n_goal * size, &nr_swap_pages);

	spin_lock(&swap_avail_lock);

start_over:
	node = numa_node_id();
	plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
		/* requeue si to after same-priority siblings */
		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
		spin_unlock(&swap_avail_lock);
		spin_lock(&si->lock);
		if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
			spin_lock(&swap_avail_lock);
			if (plist_node_empty(&si->avail_lists[node])) {
				spin_unlock(&si->lock);
				goto nextsi;
			}
			WARN(!si->highest_bit,
			     "swap_info %d in list but !highest_bit\n",
			     si->type);
			WARN(!(si->flags & SWP_WRITEOK),
			     "swap_info %d in list but !SWP_WRITEOK\n",
			     si->type);
			__del_from_avail_list(si);
			spin_unlock(&si->lock);
			goto nextsi;
		}
		if (size == SWAPFILE_CLUSTER) {
			if (!(si->flags & SWP_FILE))
				n_ret = swap_alloc_cluster(si, swp_entries);
		} else
			n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
						    n_goal, swp_entries);
		spin_unlock(&si->lock);
		if (n_ret || size == SWAPFILE_CLUSTER)
			goto check_out;
		pr_debug("scan_swap_map of si %d failed to find offset\n",
			 si->type);

		spin_lock(&swap_avail_lock);
nextsi:
		/*
		 * if we got here, it's likely that si was almost full before,
		 * and since scan_swap_map() can drop the si->lock, multiple
		 * callers probably all tried to get a page from the same si
		 * and it filled up before we could get one; or, the si filled
		 * up between us dropping swap_avail_lock and taking si->lock.
		 * Since we dropped the swap_avail_lock, the swap_avail_head
		 * list may have been modified; so if next is still in the
		 * swap_avail_head list then try it, otherwise start over
		 * if we have not gotten any slots.
		 */
		if (plist_node_empty(&next->avail_lists[node]))
			goto start_over;
	}

	spin_unlock(&swap_avail_lock);

check_out:
	if (n_ret < n_goal)
		atomic_long_add((long)(n_goal - n_ret) * size,
				&nr_swap_pages);
noswap:
	return n_ret;
}
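
/*
 * Illustrative call into get_swap_pages() for a single page (in this
 * kernel the usual caller is the per-CPU swap slot cache in
 * mm/swap_slots.c, which batches requests; this is only a sketch of the
 * calling convention):
 *
 *	swp_entry_t entry;
 *
 *	if (get_swap_pages(1, &entry, 1))
 *		... entry now refers to a slot reserved with SWAP_HAS_CACHE ...
 */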
/* The only caller of this function is now the suspend routine */
swp_entry_t get_swap_page_of_type(int type)
{
	struct swap_info_struct *si;
	pgoff_t offset;

	si = swap_info[type];
	spin_lock(&si->lock);
	if (si && (si->flags & SWP_WRITEOK)) {
		atomic_long_dec(&nr_swap_pages);
		/* This is called for allocating swap entry, not cache */
		offset = scan_swap_map(si, 1);
		if (offset) {
			spin_unlock(&si->lock);
			return swp_entry(type, offset);
		}
		atomic_long_inc(&nr_swap_pages);
	}
	spin_unlock(&si->lock);
	return (swp_entry_t) {0};
}

static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
{
	struct swap_info_struct *p;
	unsigned long offset, type;

	if (!entry.val)
		goto out;
	type = swp_type(entry);
	if (type >= nr_swapfiles)
		goto bad_nofile;
	p = swap_info[type];
	if (!(p->flags & SWP_USED))
		goto bad_device;
	offset = swp_offset(entry);
	if (offset >= p->max)
		goto bad_offset;
	return p;

bad_offset:
	pr_err("swap_info_get: %s%08lx\n", Bad_offset, entry.val);
	goto out;
bad_device:
	pr_err("swap_info_get: %s%08lx\n", Unused_file, entry.val);
	goto out;
bad_nofile:
	pr_err("swap_info_get: %s%08lx\n", Bad_file, entry.val);
out:
	return NULL;
}

static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
{
	struct swap_info_struct *p;

	p = __swap_info_get(entry);
	if (!p)
		goto out;
	if (!p->swap_map[swp_offset(entry)])
		goto bad_free;
	return p;

bad_free:
	pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val);
	goto out;
out:
	return NULL;
}

static struct swap_info_struct *swap_info_get(swp_entry_t entry)
{
	struct swap_info_struct *p;

	p = _swap_info_get(entry);
	if (p)
		spin_lock(&p->lock);
	return p;
}
static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
					struct swap_info_struct *q)
{
	struct swap_info_struct *p;

	p = _swap_info_get(entry);

	if (p != q) {
		if (q != NULL)
			spin_unlock(&q->lock);
		if (p != NULL)
			spin_lock(&p->lock);
	}
	return p;
}

static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
					      unsigned long offset,
					      unsigned char usage)
{
	unsigned char count;
	unsigned char has_cache;

	count = p->swap_map[offset];

	has_cache = count & SWAP_HAS_CACHE;
	count &= ~SWAP_HAS_CACHE;

	if (usage == SWAP_HAS_CACHE) {
		VM_BUG_ON(!has_cache);
		has_cache = 0;
	} else if (count == SWAP_MAP_SHMEM) {
		/*
		 * Or we could insist on shmem.c using a special
		 * swap_shmem_free() and free_shmem_swap_and_cache()...
		 */
		count = 0;
	} else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
		if (count == COUNT_CONTINUED) {
			if (swap_count_continued(p, offset, count))
				count = SWAP_MAP_MAX | COUNT_CONTINUED;
			else
				count = SWAP_MAP_MAX;
		} else
			count--;
	}

	usage = count | has_cache;
	p->swap_map[offset] = usage ? : SWAP_HAS_CACHE;

	return usage;
}

static unsigned char __swap_entry_free(struct swap_info_struct *p,
				       swp_entry_t entry, unsigned char usage)
{
	struct swap_cluster_info *ci;
	unsigned long offset = swp_offset(entry);

	ci = lock_cluster_or_swap_info(p, offset);
	usage = __swap_entry_free_locked(p, offset, usage);
	unlock_cluster_or_swap_info(p, ci);

	return usage;
}

static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
{
	struct swap_cluster_info *ci;
	unsigned long offset = swp_offset(entry);
	unsigned char count;

	ci = lock_cluster(p, offset);
	count = p->swap_map[offset];
	VM_BUG_ON(count != SWAP_HAS_CACHE);
	p->swap_map[offset] = 0;
	dec_cluster_info_page(p, p->cluster_info, offset);
	unlock_cluster(ci);

	mem_cgroup_uncharge_swap(entry, 1);
	swap_range_free(p, offset, 1);
}

/*
 * Caller has made sure that the swap device corresponding to entry
 * is still around or has not been recycled.
 */
void swap_free(swp_entry_t entry)
{
	struct swap_info_struct *p;

	p = _swap_info_get(entry);
	if (p) {
		if (!__swap_entry_free(p, entry, 1))
			free_swap_slot(entry);
	}
}
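
/*
 * The pattern above is the common teardown contract in this file:
 * __swap_entry_free() drops one reference and returns the remaining
 * usage byte, and only when that reaches zero is the entry handed to
 * free_swap_slot(), which batches the final swap_entry_free() via
 * swapcache_free_entries(). Note that when the usage does drop to zero,
 * __swap_entry_free_locked() leaves the map byte as SWAP_HAS_CACHE so
 * the slot is not handed out again before the batched free runs.
 */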
/*
 * Called after dropping swapcache to decrease refcnt to swap entries.
 */
void put_swap_page(struct page *page, swp_entry_t entry)
{
	unsigned long offset = swp_offset(entry);
	unsigned long idx = offset / SWAPFILE_CLUSTER;
	struct swap_cluster_info *ci;
	struct swap_info_struct *si;
	unsigned char *map;
	unsigned int i, free_entries = 0;
	unsigned char val;
	int size = swap_entry_size(hpage_nr_pages(page));

	si = _swap_info_get(entry);
	if (!si)
		return;

	ci = lock_cluster_or_swap_info(si, offset);
	if (size == SWAPFILE_CLUSTER) {
		VM_BUG_ON(!cluster_is_huge(ci));
		map = si->swap_map + offset;
		for (i = 0; i < SWAPFILE_CLUSTER; i++) {
			val = map[i];
			VM_BUG_ON(!(val & SWAP_HAS_CACHE));
			if (val == SWAP_HAS_CACHE)
				free_entries++;
		}
		cluster_clear_huge(ci);
		if (free_entries == SWAPFILE_CLUSTER) {
			unlock_cluster_or_swap_info(si, ci);
			spin_lock(&si->lock);
			ci = lock_cluster(si, offset);
			memset(map, 0, SWAPFILE_CLUSTER);
			unlock_cluster(ci);
			mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
			swap_free_cluster(si, idx);
			spin_unlock(&si->lock);
			return;
		}
	}
	for (i = 0; i < size; i++, entry.val++) {
		if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
			unlock_cluster_or_swap_info(si, ci);
			free_swap_slot(entry);
			if (i == size - 1)
				return;
			lock_cluster_or_swap_info(si, offset);
		}
	}
	unlock_cluster_or_swap_info(si, ci);
}
#ifdef CONFIG_THP_SWAP
int split_swap_cluster(swp_entry_t entry)
{
	struct swap_info_struct *si;
	struct swap_cluster_info *ci;
	unsigned long offset = swp_offset(entry);

	si = _swap_info_get(entry);
	if (!si)
		return -EBUSY;
	ci = lock_cluster(si, offset);
	cluster_clear_huge(ci);
	unlock_cluster(ci);
	return 0;
}
#endif

static int swp_entry_cmp(const void *ent1, const void *ent2)
{
	const swp_entry_t *e1 = ent1, *e2 = ent2;

	return (int)swp_type(*e1) - (int)swp_type(*e2);
}

void swapcache_free_entries(swp_entry_t *entries, int n)
{
	struct swap_info_struct *p, *prev;
	int i;

	if (n <= 0)
		return;

	prev = NULL;
	p = NULL;

	/*
	 * Sort swap entries by swap device, so each lock is only taken once.
	 * nr_swapfiles isn't absolutely correct, but the overhead of sort() is
	 * so low that it isn't necessary to optimize further.
	 */
	if (nr_swapfiles > 1)
		sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL);
	for (i = 0; i < n; ++i) {
		p = swap_info_get_cont(entries[i], prev);
		if (p)
			swap_entry_free(p, entries[i]);
		prev = p;
	}
	if (p)
		spin_unlock(&p->lock);
}
/*
 * How many references to page are currently swapped out?
 * This does not give an exact answer when swap count is continued,
 * but does include the high COUNT_CONTINUED flag to allow for that.
 */
int page_swapcount(struct page *page)
{
	int count = 0;
	struct swap_info_struct *p;
	struct swap_cluster_info *ci;
	swp_entry_t entry;
	unsigned long offset;

	entry.val = page_private(page);
	p = _swap_info_get(entry);
	if (p) {
		offset = swp_offset(entry);
		ci = lock_cluster_or_swap_info(p, offset);
		count = swap_count(p->swap_map[offset]);
		unlock_cluster_or_swap_info(p, ci);
	}
	return count;
}

int __swap_count(struct swap_info_struct *si, swp_entry_t entry)
{
	pgoff_t offset = swp_offset(entry);

	return swap_count(si->swap_map[offset]);
}

static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
{
	int count = 0;
	pgoff_t offset = swp_offset(entry);
	struct swap_cluster_info *ci;

	ci = lock_cluster_or_swap_info(si, offset);
	count = swap_count(si->swap_map[offset]);
	unlock_cluster_or_swap_info(si, ci);
	return count;
}

/*
 * How many references to @entry are currently swapped out?
 * This does not give an exact answer when swap count is continued,
 * but does include the high COUNT_CONTINUED flag to allow for that.
 */
int __swp_swapcount(swp_entry_t entry)
{
	int count = 0;
	struct swap_info_struct *si;

	si = __swap_info_get(entry);
	if (si)
		count = swap_swapcount(si, entry);
	return count;
}

/*
 * How many references to @entry are currently swapped out?
 * This considers COUNT_CONTINUED so it returns an exact answer.
 */
int swp_swapcount(swp_entry_t entry)
{
	int count, tmp_count, n;
	struct swap_info_struct *p;
	struct swap_cluster_info *ci;
	struct page *page;
	pgoff_t offset;
	unsigned char *map;

	p = _swap_info_get(entry);
	if (!p)
		return 0;

	offset = swp_offset(entry);

	ci = lock_cluster_or_swap_info(p, offset);

	count = swap_count(p->swap_map[offset]);
	if (!(count & COUNT_CONTINUED))
		goto out;

	count &= ~COUNT_CONTINUED;
	n = SWAP_MAP_MAX + 1;

	page = vmalloc_to_page(p->swap_map + offset);
	offset &= ~PAGE_MASK;
	VM_BUG_ON(page_private(page) != SWP_CONTINUED);

	do {
		page = list_next_entry(page, lru);
		map = kmap_atomic(page);
		tmp_count = map[offset];
		kunmap_atomic(map);

		count += (tmp_count & ~COUNT_CONTINUED) * n;
		n *= (SWAP_CONT_MAX + 1);
	} while (tmp_count & COUNT_CONTINUED);
out:
	unlock_cluster_or_swap_info(p, ci);
	return count;
}
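
/*
 * Worked example of the continuation arithmetic above: the first map byte
 * stores counts up to SWAP_MAP_MAX, so the base radix is
 * n = SWAP_MAP_MAX + 1, and each successive continuation byte contributes
 * (byte & ~COUNT_CONTINUED) * n, with n multiplied by (SWAP_CONT_MAX + 1)
 * for every further continuation page. In other words:
 *
 *	count = base
 *	      + c0 * (SWAP_MAP_MAX + 1)
 *	      + c1 * (SWAP_MAP_MAX + 1) * (SWAP_CONT_MAX + 1)
 *	      + ...
 *
 * where base is the count from the first map byte and c0, c1, ... are the
 * continuation bytes with COUNT_CONTINUED masked off.
 */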
static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
					 swp_entry_t entry)
{
	struct swap_cluster_info *ci;
	unsigned char *map = si->swap_map;
	unsigned long roffset = swp_offset(entry);
	unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER);
	int i;
	bool ret = false;

	ci = lock_cluster_or_swap_info(si, offset);
	if (!ci || !cluster_is_huge(ci)) {
		if (swap_count(map[roffset]))
			ret = true;
		goto unlock_out;
	}
	for (i = 0; i < SWAPFILE_CLUSTER; i++) {
		if (swap_count(map[offset + i])) {
			ret = true;
			break;
		}
	}
unlock_out:
	unlock_cluster_or_swap_info(si, ci);
	return ret;
}

static bool page_swapped(struct page *page)
{
	swp_entry_t entry;
	struct swap_info_struct *si;

	if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page)))
		return page_swapcount(page) != 0;

	page = compound_head(page);
	entry.val = page_private(page);
	si = _swap_info_get(entry);
	if (si)
		return swap_page_trans_huge_swapped(si, entry);
	return false;
}

static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
					 int *total_swapcount)
{
	int i, map_swapcount, _total_mapcount, _total_swapcount;
	unsigned long offset = 0;
	struct swap_info_struct *si;
	struct swap_cluster_info *ci = NULL;
	unsigned char *map = NULL;
	int mapcount, swapcount = 0;

	/* hugetlbfs shouldn't call it */
	VM_BUG_ON_PAGE(PageHuge(page), page);

	if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) {
		mapcount = page_trans_huge_mapcount(page, total_mapcount);
		if (PageSwapCache(page))
			swapcount = page_swapcount(page);
		if (total_swapcount)
			*total_swapcount = swapcount;
		return mapcount + swapcount;
	}

	page = compound_head(page);

	_total_mapcount = _total_swapcount = map_swapcount = 0;
	if (PageSwapCache(page)) {
		swp_entry_t entry;

		entry.val = page_private(page);
		si = _swap_info_get(entry);
		if (si) {
			map = si->swap_map;
			offset = swp_offset(entry);
		}
	}
	if (map)
		ci = lock_cluster(si, offset);
	for (i = 0; i < HPAGE_PMD_NR; i++) {
		mapcount = atomic_read(&page[i]._mapcount) + 1;
		_total_mapcount += mapcount;
		if (map) {
			swapcount = swap_count(map[offset + i]);
			_total_swapcount += swapcount;
		}
		map_swapcount = max(map_swapcount, mapcount + swapcount);
	}
	unlock_cluster(ci);
	if (PageDoubleMap(page)) {
		map_swapcount -= 1;
		_total_mapcount -= HPAGE_PMD_NR;
	}
	mapcount = compound_mapcount(page);
	map_swapcount += mapcount;
	_total_mapcount += mapcount;
	if (total_mapcount)
		*total_mapcount = _total_mapcount;
	if (total_swapcount)
		*total_swapcount = _total_swapcount;

	return map_swapcount;
}
  1312. /*
  1313. * We can write to an anon page without COW if there are no other references
  1314. * to it. And as a side-effect, free up its swap: because the old content
  1315. * on disk will never be read, and seeking back there to write new content
  1316. * later would only waste time away from clustering.
  1317. *
  1318. * NOTE: total_map_swapcount should not be relied upon by the caller if
  1319. * reuse_swap_page() returns false, but it may always be overwritten
  1320. * (see the other implementation for CONFIG_SWAP=n).
  1321. */
  1322. bool reuse_swap_page(struct page *page, int *total_map_swapcount)
  1323. {
  1324. int count, total_mapcount, total_swapcount;
  1325. VM_BUG_ON_PAGE(!PageLocked(page), page);
  1326. if (unlikely(PageKsm(page)))
  1327. return false;
  1328. count = page_trans_huge_map_swapcount(page, &total_mapcount,
  1329. &total_swapcount);
  1330. if (total_map_swapcount)
  1331. *total_map_swapcount = total_mapcount + total_swapcount;
  1332. if (count == 1 && PageSwapCache(page) &&
  1333. (likely(!PageTransCompound(page)) ||
  1334. /* The remaining swap count will be freed soon */
  1335. total_swapcount == page_swapcount(page))) {
  1336. if (!PageWriteback(page)) {
  1337. page = compound_head(page);
  1338. delete_from_swap_cache(page);
  1339. SetPageDirty(page);
  1340. } else {
  1341. swp_entry_t entry;
  1342. struct swap_info_struct *p;
  1343. entry.val = page_private(page);
  1344. p = swap_info_get(entry);
  1345. if (p->flags & SWP_STABLE_WRITES) {
  1346. spin_unlock(&p->lock);
  1347. return false;
  1348. }
  1349. spin_unlock(&p->lock);
  1350. }
  1351. }
  1352. return count <= 1;
  1353. }
  1354. /*
  1355. * If swap is getting full, or if there are no more mappings of this page,
  1356. * then try_to_free_swap is called to free its swap space.
  1357. */
  1358. int try_to_free_swap(struct page *page)
  1359. {
  1360. VM_BUG_ON_PAGE(!PageLocked(page), page);
  1361. if (!PageSwapCache(page))
  1362. return 0;
  1363. if (PageWriteback(page))
  1364. return 0;
  1365. if (page_swapped(page))
  1366. return 0;
  1367. /*
  1368. * Once hibernation has begun to create its image of memory,
  1369. * there's a danger that one of the calls to try_to_free_swap()
  1370. * - most probably a call from __try_to_reclaim_swap() while
  1371. * hibernation is allocating its own swap pages for the image,
  1372. * but conceivably even a call from memory reclaim - will free
  1373. * the swap from a page which has already been recorded in the
  1374. * image as a clean swapcache page, and then reuse its swap for
  1375. * another page of the image. On waking from hibernation, the
  1376. * original page might be freed under memory pressure, then
  1377. * later read back in from swap, now with the wrong data.
  1378. *
  1379. * Hibernation suspends storage while it is writing the image
  1380. * to disk so check that here.
  1381. */
  1382. if (pm_suspended_storage())
  1383. return 0;
  1384. page = compound_head(page);
  1385. delete_from_swap_cache(page);
  1386. SetPageDirty(page);
  1387. return 1;
  1388. }
  1389. /*
  1390. * Free the swap entry like above, but also try to
  1391. * free the page cache entry if it is the last user.
  1392. */
  1393. int free_swap_and_cache(swp_entry_t entry)
  1394. {
  1395. struct swap_info_struct *p;
  1396. struct page *page = NULL;
  1397. unsigned char count;
  1398. if (non_swap_entry(entry))
  1399. return 1;
  1400. p = _swap_info_get(entry);
  1401. if (p) {
  1402. count = __swap_entry_free(p, entry, 1);
  1403. if (count == SWAP_HAS_CACHE &&
  1404. !swap_page_trans_huge_swapped(p, entry)) {
  1405. page = find_get_page(swap_address_space(entry),
  1406. swp_offset(entry));
  1407. if (page && !trylock_page(page)) {
  1408. put_page(page);
  1409. page = NULL;
  1410. }
  1411. } else if (!count)
  1412. free_swap_slot(entry);
  1413. }
  1414. if (page) {
  1415. /*
  1416. * Not mapped elsewhere, or swap space full? Free it!
  1417. * Also recheck PageSwapCache now that the page is locked (above).
  1418. */
  1419. if (PageSwapCache(page) && !PageWriteback(page) &&
  1420. (!page_mapped(page) || mem_cgroup_swap_full(page)) &&
  1421. !swap_page_trans_huge_swapped(p, entry)) {
  1422. page = compound_head(page);
  1423. delete_from_swap_cache(page);
  1424. SetPageDirty(page);
  1425. }
  1426. unlock_page(page);
  1427. put_page(page);
  1428. }
  1429. return p != NULL;
  1430. }
  1431. #ifdef CONFIG_HIBERNATION
  1432. /*
  1433. * Find the swap type that corresponds to given device (if any).
  1434. *
  1435. * @offset - number of the PAGE_SIZE-sized block of the device, starting
  1436. * from 0, in which the swap header is expected to be located.
  1437. *
  1438. * This is needed for the suspend to disk (aka swsusp).
  1439. */
  1440. int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
  1441. {
  1442. struct block_device *bdev = NULL;
  1443. int type;
  1444. if (device)
  1445. bdev = bdget(device);
  1446. spin_lock(&swap_lock);
  1447. for (type = 0; type < nr_swapfiles; type++) {
  1448. struct swap_info_struct *sis = swap_info[type];
  1449. if (!(sis->flags & SWP_WRITEOK))
  1450. continue;
  1451. if (!bdev) {
  1452. if (bdev_p)
  1453. *bdev_p = bdgrab(sis->bdev);
  1454. spin_unlock(&swap_lock);
  1455. return type;
  1456. }
  1457. if (bdev == sis->bdev) {
  1458. struct swap_extent *se = &sis->first_swap_extent;
  1459. if (se->start_block == offset) {
  1460. if (bdev_p)
  1461. *bdev_p = bdgrab(sis->bdev);
  1462. spin_unlock(&swap_lock);
  1463. bdput(bdev);
  1464. return type;
  1465. }
  1466. }
  1467. }
  1468. spin_unlock(&swap_lock);
  1469. if (bdev)
  1470. bdput(bdev);
  1471. return -ENODEV;
  1472. }
  1473. /*
  1474. * Get the (PAGE_SIZE) block corresponding to the given offset on the swapdev
  1475. * identified by the given index into swap_info (the swap type).
  1476. */
  1477. sector_t swapdev_block(int type, pgoff_t offset)
  1478. {
  1479. struct block_device *bdev;
  1480. if ((unsigned int)type >= nr_swapfiles)
  1481. return 0;
  1482. if (!(swap_info[type]->flags & SWP_WRITEOK))
  1483. return 0;
  1484. return map_swap_entry(swp_entry(type, offset), &bdev);
  1485. }
  1486. /*
  1487. * Return either the total number of swap pages of given type, or the number
  1488. * of free pages of that type (depending on @free)
  1489. *
  1490. * This is needed for software suspend
  1491. */
  1492. unsigned int count_swap_pages(int type, int free)
  1493. {
  1494. unsigned int n = 0;
  1495. spin_lock(&swap_lock);
  1496. if ((unsigned int)type < nr_swapfiles) {
  1497. struct swap_info_struct *sis = swap_info[type];
  1498. spin_lock(&sis->lock);
  1499. if (sis->flags & SWP_WRITEOK) {
  1500. n = sis->pages;
  1501. if (free)
  1502. n -= sis->inuse_pages;
  1503. }
  1504. spin_unlock(&sis->lock);
  1505. }
  1506. spin_unlock(&swap_lock);
  1507. return n;
  1508. }
  1509. #endif /* CONFIG_HIBERNATION */
  1510. static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
  1511. {
  1512. return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
  1513. }
  1514. /*
  1515. * No need to decide whether this PTE shares the swap entry with others,
  1516. * just let do_wp_page work it out if a write is requested later - to
  1517. * force COW, vm_page_prot omits write permission from any private vma.
  1518. */
  1519. static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
  1520. unsigned long addr, swp_entry_t entry, struct page *page)
  1521. {
  1522. struct page *swapcache;
  1523. struct mem_cgroup *memcg;
  1524. spinlock_t *ptl;
  1525. pte_t *pte;
  1526. int ret = 1;
  1527. swapcache = page;
  1528. page = ksm_might_need_to_copy(page, vma, addr);
  1529. if (unlikely(!page))
  1530. return -ENOMEM;
  1531. if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
  1532. &memcg, false)) {
  1533. ret = -ENOMEM;
  1534. goto out_nolock;
  1535. }
  1536. pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
  1537. if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
  1538. mem_cgroup_cancel_charge(page, memcg, false);
  1539. ret = 0;
  1540. goto out;
  1541. }
  1542. dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
  1543. inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
  1544. get_page(page);
  1545. set_pte_at(vma->vm_mm, addr, pte,
  1546. pte_mkold(mk_pte(page, vma->vm_page_prot)));
  1547. if (page == swapcache) {
  1548. page_add_anon_rmap(page, vma, addr, false);
  1549. mem_cgroup_commit_charge(page, memcg, true, false);
  1550. } else { /* ksm created a completely new copy */
  1551. page_add_new_anon_rmap(page, vma, addr, false);
  1552. mem_cgroup_commit_charge(page, memcg, false, false);
  1553. lru_cache_add_active_or_unevictable(page, vma);
  1554. }
  1555. swap_free(entry);
  1556. /*
  1557. * Move the page to the active list so it is not
  1558. * immediately swapped out again after swapon.
  1559. */
  1560. activate_page(page);
  1561. out:
  1562. pte_unmap_unlock(pte, ptl);
  1563. out_nolock:
  1564. if (page != swapcache) {
  1565. unlock_page(page);
  1566. put_page(page);
  1567. }
  1568. return ret;
  1569. }
  1570. static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
  1571. unsigned long addr, unsigned long end,
  1572. swp_entry_t entry, struct page *page)
  1573. {
  1574. pte_t swp_pte = swp_entry_to_pte(entry);
  1575. pte_t *pte;
  1576. int ret = 0;
  1577. /*
  1578. * We don't actually need pte lock while scanning for swp_pte: since
  1579. * we hold page lock and mmap_sem, swp_pte cannot be inserted into the
  1580. * page table while we're scanning; though it could get zapped, and on
  1581. * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
  1582. * of unmatched parts which look like swp_pte, so unuse_pte must
  1583. * recheck under pte lock. Scanning without pte lock lets it be
  1584. * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
  1585. */
  1586. pte = pte_offset_map(pmd, addr);
  1587. do {
  1588. /*
  1589. * swapoff spends a _lot_ of time in this loop!
  1590. * Test inline before going to call unuse_pte.
  1591. */
  1592. if (unlikely(pte_same_as_swp(*pte, swp_pte))) {
  1593. pte_unmap(pte);
  1594. ret = unuse_pte(vma, pmd, addr, entry, page);
  1595. if (ret)
  1596. goto out;
  1597. pte = pte_offset_map(pmd, addr);
  1598. }
  1599. } while (pte++, addr += PAGE_SIZE, addr != end);
  1600. pte_unmap(pte - 1);
  1601. out:
  1602. return ret;
  1603. }
  1604. static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
  1605. unsigned long addr, unsigned long end,
  1606. swp_entry_t entry, struct page *page)
  1607. {
  1608. pmd_t *pmd;
  1609. unsigned long next;
  1610. int ret;
  1611. pmd = pmd_offset(pud, addr);
  1612. do {
  1613. cond_resched();
  1614. next = pmd_addr_end(addr, end);
  1615. if (pmd_none_or_trans_huge_or_clear_bad(pmd))
  1616. continue;
  1617. ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
  1618. if (ret)
  1619. return ret;
  1620. } while (pmd++, addr = next, addr != end);
  1621. return 0;
  1622. }
  1623. static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
  1624. unsigned long addr, unsigned long end,
  1625. swp_entry_t entry, struct page *page)
  1626. {
  1627. pud_t *pud;
  1628. unsigned long next;
  1629. int ret;
  1630. pud = pud_offset(p4d, addr);
  1631. do {
  1632. next = pud_addr_end(addr, end);
  1633. if (pud_none_or_clear_bad(pud))
  1634. continue;
  1635. ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
  1636. if (ret)
  1637. return ret;
  1638. } while (pud++, addr = next, addr != end);
  1639. return 0;
  1640. }
  1641. static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
  1642. unsigned long addr, unsigned long end,
  1643. swp_entry_t entry, struct page *page)
  1644. {
  1645. p4d_t *p4d;
  1646. unsigned long next;
  1647. int ret;
  1648. p4d = p4d_offset(pgd, addr);
  1649. do {
  1650. next = p4d_addr_end(addr, end);
  1651. if (p4d_none_or_clear_bad(p4d))
  1652. continue;
  1653. ret = unuse_pud_range(vma, p4d, addr, next, entry, page);
  1654. if (ret)
  1655. return ret;
  1656. } while (p4d++, addr = next, addr != end);
  1657. return 0;
  1658. }
  1659. static int unuse_vma(struct vm_area_struct *vma,
  1660. swp_entry_t entry, struct page *page)
  1661. {
  1662. pgd_t *pgd;
  1663. unsigned long addr, end, next;
  1664. int ret;
  1665. if (page_anon_vma(page)) {
  1666. addr = page_address_in_vma(page, vma);
  1667. if (addr == -EFAULT)
  1668. return 0;
  1669. else
  1670. end = addr + PAGE_SIZE;
  1671. } else {
  1672. addr = vma->vm_start;
  1673. end = vma->vm_end;
  1674. }
  1675. pgd = pgd_offset(vma->vm_mm, addr);
  1676. do {
  1677. next = pgd_addr_end(addr, end);
  1678. if (pgd_none_or_clear_bad(pgd))
  1679. continue;
  1680. ret = unuse_p4d_range(vma, pgd, addr, next, entry, page);
  1681. if (ret)
  1682. return ret;
  1683. } while (pgd++, addr = next, addr != end);
  1684. return 0;
  1685. }
  1686. static int unuse_mm(struct mm_struct *mm,
  1687. swp_entry_t entry, struct page *page)
  1688. {
  1689. struct vm_area_struct *vma;
  1690. int ret = 0;
  1691. if (!down_read_trylock(&mm->mmap_sem)) {
  1692. /*
  1693. * Activate page so shrink_inactive_list is unlikely to unmap
  1694. * its ptes while lock is dropped, so swapoff can make progress.
  1695. */
  1696. activate_page(page);
  1697. unlock_page(page);
  1698. down_read(&mm->mmap_sem);
  1699. lock_page(page);
  1700. }
  1701. for (vma = mm->mmap; vma; vma = vma->vm_next) {
  1702. if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
  1703. break;
  1704. cond_resched();
  1705. }
  1706. up_read(&mm->mmap_sem);
  1707. return (ret < 0)? ret: 0;
  1708. }
  1709. /*
  1710. * Scan swap_map (or frontswap_map if frontswap parameter is true)
  1711. * from current position to next entry still in use.
  1712. * Recycle to start on reaching the end, returning 0 when empty.
  1713. */
  1714. static unsigned int find_next_to_unuse(struct swap_info_struct *si,
  1715. unsigned int prev, bool frontswap)
  1716. {
  1717. unsigned int max = si->max;
  1718. unsigned int i = prev;
  1719. unsigned char count;
  1720. /*
  1721. * No need for swap_lock here: we're just looking
  1722. * for whether an entry is in use, not modifying it; false
  1723. * hits are okay, and sys_swapoff() has already prevented new
  1724. * allocations from this area (while holding swap_lock).
  1725. */
  1726. for (;;) {
  1727. if (++i >= max) {
  1728. if (!prev) {
  1729. i = 0;
  1730. break;
  1731. }
  1732. /*
  1733. * No entries in use at top of swap_map,
  1734. * loop back to start and recheck there.
  1735. */
  1736. max = prev + 1;
  1737. prev = 0;
  1738. i = 1;
  1739. }
  1740. count = READ_ONCE(si->swap_map[i]);
  1741. if (count && swap_count(count) != SWAP_MAP_BAD)
  1742. if (!frontswap || frontswap_test(si, i))
  1743. break;
  1744. if ((i % LATENCY_LIMIT) == 0)
  1745. cond_resched();
  1746. }
  1747. return i;
  1748. }
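/*
 * Illustrative walk-through of the recycling above: with prev == 100 and
 * si->max == 500, the loop first scans offsets 101..499. If none of them
 * is still in use, max is clamped to prev + 1 == 101 and the scan restarts
 * at i == 1, so offsets 1..100 get rechecked as well. Only when that
 * second pass also comes up empty does ++i reach max with prev == 0, and
 * 0 is returned to tell the caller the area is drained.
 */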
  1749. /*
  1750. * We completely avoid races by reading each swap page in advance,
  1751. * and then searching for the process using it. All the necessary
  1752. * page table adjustments can then be made atomically.
  1753. *
  1754. * if the boolean frontswap is true, only unuse pages_to_unuse pages;
  1755. * pages_to_unuse==0 means all pages; ignored if frontswap is false
  1756. */
  1757. int try_to_unuse(unsigned int type, bool frontswap,
  1758. unsigned long pages_to_unuse)
  1759. {
  1760. struct swap_info_struct *si = swap_info[type];
  1761. struct mm_struct *start_mm;
  1762. volatile unsigned char *swap_map; /* swap_map is accessed without
  1763. * locking. Mark it as volatile
  1764. * to prevent compiler doing
  1765. * something odd.
  1766. */
  1767. unsigned char swcount;
  1768. struct page *page;
  1769. swp_entry_t entry;
  1770. unsigned int i = 0;
  1771. int retval = 0;
  1772. /*
  1773. * When searching mms for an entry, a good strategy is to
  1774. * start at the first mm we freed the previous entry from
  1775. * (though actually we don't notice whether we or coincidence
  1776. * freed the entry). Initialize this start_mm with a hold.
  1777. *
  1778. * A simpler strategy would be to start at the last mm we
  1779. * freed the previous entry from; but that would take less
  1780. * advantage of mmlist ordering, which clusters forked mms
  1781. * together, child after parent. If we race with dup_mmap(), we
  1782. * prefer to resolve parent before child, lest we miss entries
  1783. * duplicated after we scanned child: using last mm would invert
  1784. * that.
  1785. */
  1786. start_mm = &init_mm;
  1787. mmget(&init_mm);
  1788. /*
  1789. * Keep on scanning until all entries have gone. Usually,
  1790. * one pass through swap_map is enough, but not necessarily:
  1791. * there are races when an instance of an entry might be missed.
  1792. */
  1793. while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
  1794. if (signal_pending(current)) {
  1795. retval = -EINTR;
  1796. break;
  1797. }
  1798. /*
  1799. * Get a page for the entry, using the existing swap
  1800. * cache page if there is one. Otherwise, get a clean
  1801. * page and read the swap into it.
  1802. */
  1803. swap_map = &si->swap_map[i];
  1804. entry = swp_entry(type, i);
  1805. page = read_swap_cache_async(entry,
  1806. GFP_HIGHUSER_MOVABLE, NULL, 0, false);
  1807. if (!page) {
  1808. /*
  1809. * Either swap_duplicate() failed because entry
  1810. * has been freed independently, and will not be
  1811. * reused since sys_swapoff() already disabled
  1812. * allocation from here, or alloc_page() failed.
  1813. */
  1814. swcount = *swap_map;
  1815. /*
  1816. * We don't hold lock here, so the swap entry could be
  1817. * SWAP_MAP_BAD (when the cluster is discarding).
  1818. * Instead of failing out, we can just skip the swap
  1819. * entry, because swapoff will wait for the discard to
  1820. * finish anyway.
  1821. */
  1822. if (!swcount || swcount == SWAP_MAP_BAD)
  1823. continue;
  1824. retval = -ENOMEM;
  1825. break;
  1826. }
  1827. /*
  1828. * Don't hold on to start_mm if it looks like exiting.
  1829. */
  1830. if (atomic_read(&start_mm->mm_users) == 1) {
  1831. mmput(start_mm);
  1832. start_mm = &init_mm;
  1833. mmget(&init_mm);
  1834. }
  1835. /*
  1836. * Wait for and lock page. When do_swap_page races with
  1837. * try_to_unuse, do_swap_page can handle the fault much
  1838. * faster than try_to_unuse can locate the entry. This
  1839. * apparently redundant "wait_on_page_locked" lets try_to_unuse
  1840. * defer to do_swap_page in such a case - in some tests,
  1841. * do_swap_page and try_to_unuse repeatedly compete.
  1842. */
  1843. wait_on_page_locked(page);
  1844. wait_on_page_writeback(page);
  1845. lock_page(page);
  1846. wait_on_page_writeback(page);
  1847. /*
  1848. * Remove all references to entry.
  1849. */
  1850. swcount = *swap_map;
  1851. if (swap_count(swcount) == SWAP_MAP_SHMEM) {
  1852. retval = shmem_unuse(entry, page);
  1853. /* page has already been unlocked and released */
  1854. if (retval < 0)
  1855. break;
  1856. continue;
  1857. }
  1858. if (swap_count(swcount) && start_mm != &init_mm)
  1859. retval = unuse_mm(start_mm, entry, page);
  1860. if (swap_count(*swap_map)) {
  1861. int set_start_mm = (*swap_map >= swcount);
  1862. struct list_head *p = &start_mm->mmlist;
  1863. struct mm_struct *new_start_mm = start_mm;
  1864. struct mm_struct *prev_mm = start_mm;
  1865. struct mm_struct *mm;
  1866. mmget(new_start_mm);
  1867. mmget(prev_mm);
  1868. spin_lock(&mmlist_lock);
  1869. while (swap_count(*swap_map) && !retval &&
  1870. (p = p->next) != &start_mm->mmlist) {
  1871. mm = list_entry(p, struct mm_struct, mmlist);
  1872. if (!mmget_not_zero(mm))
  1873. continue;
  1874. spin_unlock(&mmlist_lock);
  1875. mmput(prev_mm);
  1876. prev_mm = mm;
  1877. cond_resched();
  1878. swcount = *swap_map;
  1879. if (!swap_count(swcount)) /* any usage ? */
  1880. ;
  1881. else if (mm == &init_mm)
  1882. set_start_mm = 1;
  1883. else
  1884. retval = unuse_mm(mm, entry, page);
  1885. if (set_start_mm && *swap_map < swcount) {
  1886. mmput(new_start_mm);
  1887. mmget(mm);
  1888. new_start_mm = mm;
  1889. set_start_mm = 0;
  1890. }
  1891. spin_lock(&mmlist_lock);
  1892. }
  1893. spin_unlock(&mmlist_lock);
  1894. mmput(prev_mm);
  1895. mmput(start_mm);
  1896. start_mm = new_start_mm;
  1897. }
  1898. if (retval) {
  1899. unlock_page(page);
  1900. put_page(page);
  1901. break;
  1902. }
  1903. /*
  1904. * If a reference remains (rare), we would like to leave
  1905. * the page in the swap cache; but try_to_unmap could
  1906. * then re-duplicate the entry once we drop page lock,
  1907. * so we might loop indefinitely; also, that page could
  1908. * not be swapped out to other storage meanwhile. So:
  1909. * delete from cache even if there's another reference,
  1910. * after ensuring that the data has been saved to disk -
  1911. * since if the reference remains (rarer), it will be
  1912. * read from disk into another page. Splitting into two
  1913. * pages would be incorrect if swap supported "shared
  1914. * private" pages, but they are handled by tmpfs files.
  1915. *
  1916. * Given how unuse_vma() targets one particular offset
  1917. * in an anon_vma, once the anon_vma has been determined,
  1918. * this splitting happens to be just what is needed to
  1919. * handle where KSM pages have been swapped out: re-reading
  1920. * is unnecessarily slow, but we can fix that later on.
  1921. */
  1922. if (swap_count(*swap_map) &&
  1923. PageDirty(page) && PageSwapCache(page)) {
  1924. struct writeback_control wbc = {
  1925. .sync_mode = WB_SYNC_NONE,
  1926. };
  1927. swap_writepage(compound_head(page), &wbc);
  1928. lock_page(page);
  1929. wait_on_page_writeback(page);
  1930. }
  1931. /*
  1932. * It is conceivable that a racing task removed this page from
  1933. * swap cache just before we acquired the page lock at the top,
  1934. * or while we dropped it in unuse_mm(). The page might even
  1935. * be back in swap cache on another swap area: that we must not
  1936. * delete, since it may not have been written out to swap yet.
  1937. */
  1938. if (PageSwapCache(page) &&
  1939. likely(page_private(page) == entry.val) &&
  1940. !page_swapped(page))
  1941. delete_from_swap_cache(compound_head(page));
  1942. /*
  1943. * So that we could skip searching mms once the swap count went
  1944. * to 1, we did not mark any present ptes as dirty: we must now
  1945. * mark the page dirty so shrink_page_list will preserve it.
  1946. */
  1947. SetPageDirty(page);
  1948. unlock_page(page);
  1949. put_page(page);
  1950. /*
  1951. * Make sure that we aren't completely killing
  1952. * interactive performance.
  1953. */
  1954. cond_resched();
  1955. if (frontswap && pages_to_unuse > 0) {
  1956. if (!--pages_to_unuse)
  1957. break;
  1958. }
  1959. }
  1960. mmput(start_mm);
  1961. return retval;
  1962. }
  1963. /*
  1964. * After a successful try_to_unuse, if no swap is now in use, we know
  1965. * we can empty the mmlist. swap_lock must be held on entry and exit.
  1966. * Note that mmlist_lock nests inside swap_lock, and an mm must be
  1967. * added to the mmlist just after page_duplicate - before would be racy.
  1968. */
  1969. static void drain_mmlist(void)
  1970. {
  1971. struct list_head *p, *next;
  1972. unsigned int type;
  1973. for (type = 0; type < nr_swapfiles; type++)
  1974. if (swap_info[type]->inuse_pages)
  1975. return;
  1976. spin_lock(&mmlist_lock);
  1977. list_for_each_safe(p, next, &init_mm.mmlist)
  1978. list_del_init(p);
  1979. spin_unlock(&mmlist_lock);
  1980. }
  1981. /*
  1982. * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
  1983. * corresponds to page offset for the specified swap entry.
  1984. * Note that the return type of this function is sector_t, but it returns the
  1985. * page offset into the bdev, not the sector offset.
  1986. */
  1987. static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
  1988. {
  1989. struct swap_info_struct *sis;
  1990. struct swap_extent *start_se;
  1991. struct swap_extent *se;
  1992. pgoff_t offset;
  1993. sis = swap_info[swp_type(entry)];
  1994. *bdev = sis->bdev;
  1995. offset = swp_offset(entry);
  1996. start_se = sis->curr_swap_extent;
  1997. se = start_se;
  1998. for ( ; ; ) {
  1999. if (se->start_page <= offset &&
  2000. offset < (se->start_page + se->nr_pages)) {
  2001. return se->start_block + (offset - se->start_page);
  2002. }
  2003. se = list_next_entry(se, list);
  2004. sis->curr_swap_extent = se;
  2005. BUG_ON(se == start_se); /* It *must* be present */
  2006. }
  2007. }
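/*
 * Worked example of the extent walk above, with hypothetical numbers: for
 * an extent { start_page = 1024, nr_pages = 2048, start_block = 900000 },
 * a swap entry with offset 1500 falls inside the extent, so the function
 * returns 900000 + (1500 - 1024) == 900476 and leaves this extent cached
 * in sis->curr_swap_extent, where the next (usually nearby) lookup will
 * find it first.
 */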
  2008. /*
  2009. * Returns the page offset into bdev for the specified page's swap entry.
  2010. */
  2011. sector_t map_swap_page(struct page *page, struct block_device **bdev)
  2012. {
  2013. swp_entry_t entry;
  2014. entry.val = page_private(page);
  2015. return map_swap_entry(entry, bdev);
  2016. }
  2017. /*
  2018. * Free all of a swapdev's extent information
  2019. */
  2020. static void destroy_swap_extents(struct swap_info_struct *sis)
  2021. {
  2022. while (!list_empty(&sis->first_swap_extent.list)) {
  2023. struct swap_extent *se;
  2024. se = list_first_entry(&sis->first_swap_extent.list,
  2025. struct swap_extent, list);
  2026. list_del(&se->list);
  2027. kfree(se);
  2028. }
  2029. if (sis->flags & SWP_FILE) {
  2030. struct file *swap_file = sis->swap_file;
  2031. struct address_space *mapping = swap_file->f_mapping;
  2032. sis->flags &= ~SWP_FILE;
  2033. mapping->a_ops->swap_deactivate(swap_file);
  2034. }
  2035. }
  2036. /*
  2037. * Add a block range (and the corresponding page range) into this swapdev's
  2038. * extent list. The extent list is kept sorted in page order.
  2039. *
  2040. * This function rather assumes that it is called in ascending page order.
  2041. */
  2042. int
  2043. add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
  2044. unsigned long nr_pages, sector_t start_block)
  2045. {
  2046. struct swap_extent *se;
  2047. struct swap_extent *new_se;
  2048. struct list_head *lh;
  2049. if (start_page == 0) {
  2050. se = &sis->first_swap_extent;
  2051. sis->curr_swap_extent = se;
  2052. se->start_page = 0;
  2053. se->nr_pages = nr_pages;
  2054. se->start_block = start_block;
  2055. return 1;
  2056. } else {
  2057. lh = sis->first_swap_extent.list.prev; /* Highest extent */
  2058. se = list_entry(lh, struct swap_extent, list);
  2059. BUG_ON(se->start_page + se->nr_pages != start_page);
  2060. if (se->start_block + se->nr_pages == start_block) {
  2061. /* Merge it */
  2062. se->nr_pages += nr_pages;
  2063. return 0;
  2064. }
  2065. }
  2066. /*
  2067. * No merge. Insert a new extent, preserving ordering.
  2068. */
  2069. new_se = kmalloc(sizeof(*se), GFP_KERNEL);
  2070. if (new_se == NULL)
  2071. return -ENOMEM;
  2072. new_se->start_page = start_page;
  2073. new_se->nr_pages = nr_pages;
  2074. new_se->start_block = start_block;
  2075. list_add_tail(&new_se->list, &sis->first_swap_extent.list);
  2076. return 1;
  2077. }
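/*
 * Worked example of the merge test above, with hypothetical numbers: if
 * the highest extent so far is { start_page = 0, nr_pages = 256,
 * start_block = 8192 } and the next call passes start_page == 256 with
 * start_block == 8448, then 8192 + 256 == 8448, the block ranges are
 * contiguous, and the existing extent simply grows to nr_pages = 512
 * instead of a new one being allocated.
 */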
  2078. /*
  2079. * A `swap extent' is a simple thing which maps a contiguous range of pages
  2080. * onto a contiguous range of disk blocks. An ordered list of swap extents
  2081. * is built at swapon time and is then used at swap_writepage/swap_readpage
  2082. * time for locating where on disk a page belongs.
  2083. *
  2084. * If the swapfile is an S_ISBLK block device, a single extent is installed.
  2085. * This is done so that the main operating code can treat S_ISBLK and S_ISREG
  2086. * swap files identically.
  2087. *
  2088. * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
  2089. * extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK
  2090. * swapfiles are handled *identically* after swapon time.
  2091. *
  2092. * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
  2093. * and will parse them into an ordered extent list, in PAGE_SIZE chunks. If
  2094. * some stray blocks are found which do not fall within the PAGE_SIZE alignment
  2095. * requirements, they are simply tossed out - we will never use those blocks
  2096. * for swapping.
  2097. *
  2098. * For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon. This
  2099. * prevents root from shooting her foot off by ftruncating an in-use swapfile,
  2100. * which will scribble on the fs.
  2101. *
  2102. * The amount of disk space which a single swap extent represents varies.
  2103. * Typically it is in the 1-4 megabyte range. So we can have hundreds of
  2104. * extents in the list. To avoid much list walking, we cache the previous
  2105. * search location in `curr_swap_extent', and start new searches from there.
  2106. * This is extremely effective. The average number of iterations in
  2107. * map_swap_page() has been measured at about 0.3 per page. - akpm.
  2108. */
  2109. static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
  2110. {
  2111. struct file *swap_file = sis->swap_file;
  2112. struct address_space *mapping = swap_file->f_mapping;
  2113. struct inode *inode = mapping->host;
  2114. int ret;
  2115. if (S_ISBLK(inode->i_mode)) {
  2116. ret = add_swap_extent(sis, 0, sis->max, 0);
  2117. *span = sis->pages;
  2118. return ret;
  2119. }
  2120. if (mapping->a_ops->swap_activate) {
  2121. ret = mapping->a_ops->swap_activate(sis, swap_file, span);
  2122. if (!ret) {
  2123. sis->flags |= SWP_FILE;
  2124. ret = add_swap_extent(sis, 0, sis->max, 0);
  2125. *span = sis->pages;
  2126. }
  2127. return ret;
  2128. }
  2129. return generic_swapfile_activate(sis, swap_file, span);
  2130. }
  2131. static int swap_node(struct swap_info_struct *p)
  2132. {
  2133. struct block_device *bdev;
  2134. if (p->bdev)
  2135. bdev = p->bdev;
  2136. else
  2137. bdev = p->swap_file->f_inode->i_sb->s_bdev;
  2138. return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
  2139. }
  2140. static void _enable_swap_info(struct swap_info_struct *p, int prio,
  2141. unsigned char *swap_map,
  2142. struct swap_cluster_info *cluster_info)
  2143. {
  2144. int i;
  2145. if (prio >= 0)
  2146. p->prio = prio;
  2147. else
  2148. p->prio = --least_priority;
  2149. /*
  2150. * the plist prio is negated because plist ordering is
  2151. * low-to-high, while swap ordering is high-to-low
  2152. */
  2153. p->list.prio = -p->prio;
  2154. for_each_node(i) {
  2155. if (p->prio >= 0)
  2156. p->avail_lists[i].prio = -p->prio;
  2157. else {
  2158. if (swap_node(p) == i)
  2159. p->avail_lists[i].prio = 1;
  2160. else
  2161. p->avail_lists[i].prio = -p->prio;
  2162. }
  2163. }
  2164. p->swap_map = swap_map;
  2165. p->cluster_info = cluster_info;
  2166. p->flags |= SWP_WRITEOK;
  2167. atomic_long_add(p->pages, &nr_swap_pages);
  2168. total_swap_pages += p->pages;
  2169. assert_spin_locked(&swap_lock);
  2170. /*
  2171. * both lists are plists, and thus priority ordered.
  2172. * swap_active_head needs to be priority ordered for swapoff(),
  2173. * which on removal of any swap_info_struct with an auto-assigned
  2174. * (i.e. negative) priority increments the auto-assigned priority
  2175. * of any lower-priority swap_info_structs.
  2176. * swap_avail_head needs to be priority ordered for get_swap_page(),
  2177. * which allocates swap pages from the highest available priority
  2178. * swap_info_struct.
  2179. */
  2180. plist_add(&p->list, &swap_active_head);
  2181. add_to_avail_list(p);
  2182. }
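/*
 * Illustrative example of the per-node priorities set up above: a device
 * that received an auto-assigned prio of -2 and whose backing disk sits
 * on node 1 ends up with avail_lists[1].prio == 1 and avail_lists[n].prio
 * == 2 on every other node. Since plists sort low-to-high, allocations on
 * node 1 see this device earlier in swap_avail_heads[1] than its global
 * priority alone would place it, while an explicit prio >= 0 bypasses the
 * node preference entirely.
 */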
  2183. static void enable_swap_info(struct swap_info_struct *p, int prio,
  2184. unsigned char *swap_map,
  2185. struct swap_cluster_info *cluster_info,
  2186. unsigned long *frontswap_map)
  2187. {
  2188. frontswap_init(p->type, frontswap_map);
  2189. spin_lock(&swap_lock);
  2190. spin_lock(&p->lock);
  2191. _enable_swap_info(p, prio, swap_map, cluster_info);
  2192. spin_unlock(&p->lock);
  2193. spin_unlock(&swap_lock);
  2194. }
  2195. static void reinsert_swap_info(struct swap_info_struct *p)
  2196. {
  2197. spin_lock(&swap_lock);
  2198. spin_lock(&p->lock);
  2199. _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
  2200. spin_unlock(&p->lock);
  2201. spin_unlock(&swap_lock);
  2202. }
  2203. bool has_usable_swap(void)
  2204. {
  2205. bool ret = true;
  2206. spin_lock(&swap_lock);
  2207. if (plist_head_empty(&swap_active_head))
  2208. ret = false;
  2209. spin_unlock(&swap_lock);
  2210. return ret;
  2211. }
  2212. SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
  2213. {
  2214. struct swap_info_struct *p = NULL;
  2215. unsigned char *swap_map;
  2216. struct swap_cluster_info *cluster_info;
  2217. unsigned long *frontswap_map;
  2218. struct file *swap_file, *victim;
  2219. struct address_space *mapping;
  2220. struct inode *inode;
  2221. struct filename *pathname;
  2222. int err, found = 0;
  2223. unsigned int old_block_size;
  2224. if (!capable(CAP_SYS_ADMIN))
  2225. return -EPERM;
  2226. BUG_ON(!current->mm);
  2227. pathname = getname(specialfile);
  2228. if (IS_ERR(pathname))
  2229. return PTR_ERR(pathname);
  2230. victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
  2231. err = PTR_ERR(victim);
  2232. if (IS_ERR(victim))
  2233. goto out;
  2234. mapping = victim->f_mapping;
  2235. spin_lock(&swap_lock);
  2236. plist_for_each_entry(p, &swap_active_head, list) {
  2237. if (p->flags & SWP_WRITEOK) {
  2238. if (p->swap_file->f_mapping == mapping) {
  2239. found = 1;
  2240. break;
  2241. }
  2242. }
  2243. }
  2244. if (!found) {
  2245. err = -EINVAL;
  2246. spin_unlock(&swap_lock);
  2247. goto out_dput;
  2248. }
  2249. if (!security_vm_enough_memory_mm(current->mm, p->pages))
  2250. vm_unacct_memory(p->pages);
  2251. else {
  2252. err = -ENOMEM;
  2253. spin_unlock(&swap_lock);
  2254. goto out_dput;
  2255. }
  2256. del_from_avail_list(p);
  2257. spin_lock(&p->lock);
  2258. if (p->prio < 0) {
  2259. struct swap_info_struct *si = p;
  2260. int nid;
  2261. plist_for_each_entry_continue(si, &swap_active_head, list) {
  2262. si->prio++;
  2263. si->list.prio--;
  2264. for_each_node(nid) {
  2265. if (si->avail_lists[nid].prio != 1)
  2266. si->avail_lists[nid].prio--;
  2267. }
  2268. }
  2269. least_priority++;
  2270. }
  2271. plist_del(&p->list, &swap_active_head);
  2272. atomic_long_sub(p->pages, &nr_swap_pages);
  2273. total_swap_pages -= p->pages;
  2274. p->flags &= ~SWP_WRITEOK;
  2275. spin_unlock(&p->lock);
  2276. spin_unlock(&swap_lock);
  2277. disable_swap_slots_cache_lock();
  2278. set_current_oom_origin();
  2279. err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
  2280. clear_current_oom_origin();
  2281. if (err) {
  2282. /* re-insert swap space back into swap_list */
  2283. reinsert_swap_info(p);
  2284. reenable_swap_slots_cache_unlock();
  2285. goto out_dput;
  2286. }
  2287. reenable_swap_slots_cache_unlock();
  2288. flush_work(&p->discard_work);
  2289. destroy_swap_extents(p);
  2290. if (p->flags & SWP_CONTINUED)
  2291. free_swap_count_continuations(p);
  2292. if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev)))
  2293. atomic_dec(&nr_rotate_swap);
  2294. mutex_lock(&swapon_mutex);
  2295. spin_lock(&swap_lock);
  2296. spin_lock(&p->lock);
  2297. drain_mmlist();
  2298. /* wait for anyone still in scan_swap_map */
  2299. p->highest_bit = 0; /* cuts scans short */
  2300. while (p->flags >= SWP_SCANNING) {
  2301. spin_unlock(&p->lock);
  2302. spin_unlock(&swap_lock);
  2303. schedule_timeout_uninterruptible(1);
  2304. spin_lock(&swap_lock);
  2305. spin_lock(&p->lock);
  2306. }
  2307. swap_file = p->swap_file;
  2308. old_block_size = p->old_block_size;
  2309. p->swap_file = NULL;
  2310. p->max = 0;
  2311. swap_map = p->swap_map;
  2312. p->swap_map = NULL;
  2313. cluster_info = p->cluster_info;
  2314. p->cluster_info = NULL;
  2315. frontswap_map = frontswap_map_get(p);
  2316. spin_unlock(&p->lock);
  2317. spin_unlock(&swap_lock);
  2318. frontswap_invalidate_area(p->type);
  2319. frontswap_map_set(p, NULL);
  2320. mutex_unlock(&swapon_mutex);
  2321. free_percpu(p->percpu_cluster);
  2322. p->percpu_cluster = NULL;
  2323. vfree(swap_map);
  2324. kvfree(cluster_info);
  2325. kvfree(frontswap_map);
  2326. /* Destroy swap account information */
  2327. swap_cgroup_swapoff(p->type);
  2328. exit_swap_address_space(p->type);
  2329. inode = mapping->host;
  2330. if (S_ISBLK(inode->i_mode)) {
  2331. struct block_device *bdev = I_BDEV(inode);
  2332. set_blocksize(bdev, old_block_size);
  2333. blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
  2334. } else {
  2335. inode_lock(inode);
  2336. inode->i_flags &= ~S_SWAPFILE;
  2337. inode_unlock(inode);
  2338. }
  2339. filp_close(swap_file, NULL);
  2340. /*
  2341. * Clear the SWP_USED flag after all resources are freed so that swapon
  2342. * can reuse this swap_info in alloc_swap_info() safely. It is ok to
  2343. * not hold p->lock after we cleared its SWP_WRITEOK.
  2344. */
  2345. spin_lock(&swap_lock);
  2346. p->flags = 0;
  2347. spin_unlock(&swap_lock);
  2348. err = 0;
  2349. atomic_inc(&proc_poll_event);
  2350. wake_up_interruptible(&proc_poll_wait);
  2351. out_dput:
  2352. filp_close(victim, NULL);
  2353. out:
  2354. putname(pathname);
  2355. return err;
  2356. }
  2357. #ifdef CONFIG_PROC_FS
  2358. static __poll_t swaps_poll(struct file *file, poll_table *wait)
  2359. {
  2360. struct seq_file *seq = file->private_data;
  2361. poll_wait(file, &proc_poll_wait, wait);
  2362. if (seq->poll_event != atomic_read(&proc_poll_event)) {
  2363. seq->poll_event = atomic_read(&proc_poll_event);
  2364. return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI;
  2365. }
  2366. return EPOLLIN | EPOLLRDNORM;
  2367. }
  2368. /* iterator */
  2369. static void *swap_start(struct seq_file *swap, loff_t *pos)
  2370. {
  2371. struct swap_info_struct *si;
  2372. int type;
  2373. loff_t l = *pos;
  2374. mutex_lock(&swapon_mutex);
  2375. if (!l)
  2376. return SEQ_START_TOKEN;
  2377. for (type = 0; type < nr_swapfiles; type++) {
  2378. smp_rmb(); /* read nr_swapfiles before swap_info[type] */
  2379. si = swap_info[type];
  2380. if (!(si->flags & SWP_USED) || !si->swap_map)
  2381. continue;
  2382. if (!--l)
  2383. return si;
  2384. }
  2385. return NULL;
  2386. }
  2387. static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
  2388. {
  2389. struct swap_info_struct *si = v;
  2390. int type;
  2391. if (v == SEQ_START_TOKEN)
  2392. type = 0;
  2393. else
  2394. type = si->type + 1;
  2395. for (; type < nr_swapfiles; type++) {
  2396. smp_rmb(); /* read nr_swapfiles before swap_info[type] */
  2397. si = swap_info[type];
  2398. if (!(si->flags & SWP_USED) || !si->swap_map)
  2399. continue;
  2400. ++*pos;
  2401. return si;
  2402. }
  2403. return NULL;
  2404. }
  2405. static void swap_stop(struct seq_file *swap, void *v)
  2406. {
  2407. mutex_unlock(&swapon_mutex);
  2408. }
  2409. static int swap_show(struct seq_file *swap, void *v)
  2410. {
  2411. struct swap_info_struct *si = v;
  2412. struct file *file;
  2413. int len;
  2414. if (si == SEQ_START_TOKEN) {
  2415. seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
  2416. return 0;
  2417. }
  2418. file = si->swap_file;
  2419. len = seq_file_path(swap, file, " \t\n\\");
  2420. seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
  2421. len < 40 ? 40 - len : 1, " ",
  2422. S_ISBLK(file_inode(file)->i_mode) ?
  2423. "partition" : "file\t",
  2424. si->pages << (PAGE_SHIFT - 10),
  2425. si->inuse_pages << (PAGE_SHIFT - 10),
  2426. si->prio);
  2427. return 0;
  2428. }
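/*
 * The resulting /proc/swaps output looks roughly like this (illustrative
 * values; sizes are in KiB because of the PAGE_SHIFT - 10 shifts above):
 *
 *	Filename                                Type            Size    Used    Priority
 *	/dev/sda2                               partition       8388604 1024    -2
 *	/swapfile                               file            2097148 0       -3
 */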
  2429. static const struct seq_operations swaps_op = {
  2430. .start = swap_start,
  2431. .next = swap_next,
  2432. .stop = swap_stop,
  2433. .show = swap_show
  2434. };
  2435. static int swaps_open(struct inode *inode, struct file *file)
  2436. {
  2437. struct seq_file *seq;
  2438. int ret;
  2439. ret = seq_open(file, &swaps_op);
  2440. if (ret)
  2441. return ret;
  2442. seq = file->private_data;
  2443. seq->poll_event = atomic_read(&proc_poll_event);
  2444. return 0;
  2445. }
  2446. static const struct file_operations proc_swaps_operations = {
  2447. .open = swaps_open,
  2448. .read = seq_read,
  2449. .llseek = seq_lseek,
  2450. .release = seq_release,
  2451. .poll = swaps_poll,
  2452. };
  2453. static int __init procswaps_init(void)
  2454. {
  2455. proc_create("swaps", 0, NULL, &proc_swaps_operations);
  2456. return 0;
  2457. }
  2458. __initcall(procswaps_init);
  2459. #endif /* CONFIG_PROC_FS */
  2460. #ifdef MAX_SWAPFILES_CHECK
  2461. static int __init max_swapfiles_check(void)
  2462. {
  2463. MAX_SWAPFILES_CHECK();
  2464. return 0;
  2465. }
  2466. late_initcall(max_swapfiles_check);
  2467. #endif
  2468. static struct swap_info_struct *alloc_swap_info(void)
  2469. {
  2470. struct swap_info_struct *p;
  2471. unsigned int type;
  2472. int i;
  2473. p = kzalloc(sizeof(*p), GFP_KERNEL);
  2474. if (!p)
  2475. return ERR_PTR(-ENOMEM);
  2476. spin_lock(&swap_lock);
  2477. for (type = 0; type < nr_swapfiles; type++) {
  2478. if (!(swap_info[type]->flags & SWP_USED))
  2479. break;
  2480. }
  2481. if (type >= MAX_SWAPFILES) {
  2482. spin_unlock(&swap_lock);
  2483. kfree(p);
  2484. return ERR_PTR(-EPERM);
  2485. }
  2486. if (type >= nr_swapfiles) {
  2487. p->type = type;
  2488. swap_info[type] = p;
  2489. /*
  2490. * Write swap_info[type] before nr_swapfiles, in case a
  2491. * racing procfs swap_start() or swap_next() is reading them.
  2492. * (We never shrink nr_swapfiles, we never free this entry.)
  2493. */
  2494. smp_wmb();
  2495. nr_swapfiles++;
  2496. } else {
  2497. kfree(p);
  2498. p = swap_info[type];
  2499. /*
  2500. * Do not memset this entry: a racing procfs swap_next()
  2501. * would be relying on p->type to remain valid.
  2502. */
  2503. }
  2504. INIT_LIST_HEAD(&p->first_swap_extent.list);
  2505. plist_node_init(&p->list, 0);
  2506. for_each_node(i)
  2507. plist_node_init(&p->avail_lists[i], 0);
  2508. p->flags = SWP_USED;
  2509. spin_unlock(&swap_lock);
  2510. spin_lock_init(&p->lock);
  2511. spin_lock_init(&p->cont_lock);
  2512. return p;
  2513. }
  2514. static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
  2515. {
  2516. int error;
  2517. if (S_ISBLK(inode->i_mode)) {
  2518. p->bdev = bdgrab(I_BDEV(inode));
  2519. error = blkdev_get(p->bdev,
  2520. FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
  2521. if (error < 0) {
  2522. p->bdev = NULL;
  2523. return error;
  2524. }
  2525. p->old_block_size = block_size(p->bdev);
  2526. error = set_blocksize(p->bdev, PAGE_SIZE);
  2527. if (error < 0)
  2528. return error;
  2529. p->flags |= SWP_BLKDEV;
  2530. } else if (S_ISREG(inode->i_mode)) {
  2531. p->bdev = inode->i_sb->s_bdev;
  2532. inode_lock(inode);
  2533. if (IS_SWAPFILE(inode))
  2534. return -EBUSY;
  2535. } else
  2536. return -EINVAL;
  2537. return 0;
  2538. }
  2539. /*
  2540. * Find out how many pages are allowed for a single swap device. There
  2541. * are two limiting factors:
  2542. * 1) the number of bits for the swap offset in the swp_entry_t type, and
  2543. * 2) the number of bits in the swap pte, as defined by the different
  2544. * architectures.
  2545. *
  2546. * In order to find the largest possible bit mask, a swap entry with
  2547. * swap type 0 and swap offset ~0UL is created, encoded to a swap pte,
  2548. * decoded to a swp_entry_t again, and finally the swap offset is
  2549. * extracted.
  2550. *
  2551. * This will mask all the bits from the initial ~0UL mask that can't
  2552. * be encoded in either the swp_entry_t or the architecture definition
  2553. * of a swap pte.
  2554. */
  2555. unsigned long generic_max_swapfile_size(void)
  2556. {
  2557. return swp_offset(pte_to_swp_entry(
  2558. swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
  2559. }
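/*
 * Illustrative example of the round-trip above: if an architecture's swap
 * pte could only hold, say, a 5-bit type and a 50-bit offset, encoding
 * swp_entry(0, ~0UL) and decoding it again would truncate the offset to
 * 50 set bits, so this function would return 2^50, the maximum number of
 * PAGE_SIZE pages a single swap device may span on that configuration.
 */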
  2560. /* Can be overridden by an architecture for additional checks. */
  2561. __weak unsigned long max_swapfile_size(void)
  2562. {
  2563. return generic_max_swapfile_size();
  2564. }
  2565. static unsigned long read_swap_header(struct swap_info_struct *p,
  2566. union swap_header *swap_header,
  2567. struct inode *inode)
  2568. {
  2569. int i;
  2570. unsigned long maxpages;
  2571. unsigned long swapfilepages;
  2572. unsigned long last_page;
  2573. if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
  2574. pr_err("Unable to find swap-space signature\n");
  2575. return 0;
  2576. }
  2577. /* swap partition endianness hack... */
  2578. if (swab32(swap_header->info.version) == 1) {
  2579. swab32s(&swap_header->info.version);
  2580. swab32s(&swap_header->info.last_page);
  2581. swab32s(&swap_header->info.nr_badpages);
  2582. if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
  2583. return 0;
  2584. for (i = 0; i < swap_header->info.nr_badpages; i++)
  2585. swab32s(&swap_header->info.badpages[i]);
  2586. }
  2587. /* Check the swap header's sub-version */
  2588. if (swap_header->info.version != 1) {
  2589. pr_warn("Unable to handle swap header version %d\n",
  2590. swap_header->info.version);
  2591. return 0;
  2592. }
  2593. p->lowest_bit = 1;
  2594. p->cluster_next = 1;
  2595. p->cluster_nr = 0;
  2596. maxpages = max_swapfile_size();
  2597. last_page = swap_header->info.last_page;
  2598. if (!last_page) {
  2599. pr_warn("Empty swap-file\n");
  2600. return 0;
  2601. }
  2602. if (last_page > maxpages) {
  2603. pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
  2604. maxpages << (PAGE_SHIFT - 10),
  2605. last_page << (PAGE_SHIFT - 10));
  2606. }
  2607. if (maxpages > last_page) {
  2608. maxpages = last_page + 1;
  2609. /* p->max is an unsigned int: don't overflow it */
  2610. if ((unsigned int)maxpages == 0)
  2611. maxpages = UINT_MAX;
  2612. }
  2613. p->highest_bit = maxpages - 1;
  2614. if (!maxpages)
  2615. return 0;
  2616. swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
  2617. if (swapfilepages && maxpages > swapfilepages) {
  2618. pr_warn("Swap area shorter than signature indicates\n");
  2619. return 0;
  2620. }
  2621. if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
  2622. return 0;
  2623. if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
  2624. return 0;
  2625. return maxpages;
  2626. }
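/*
 * Worked example for the clamping above: a freshly mkswap'ed 1 GiB
 * partition with 4 KiB pages advertises last_page == 262143. Assuming
 * max_swapfile_size() is larger than that, maxpages becomes last_page + 1
 * == 262144 (the count includes the header page at offset 0) and
 * p->highest_bit ends up as 262143, the last usable offset.
 */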
  2627. #define SWAP_CLUSTER_INFO_COLS \
  2628. DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
  2629. #define SWAP_CLUSTER_SPACE_COLS \
  2630. DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
  2631. #define SWAP_CLUSTER_COLS \
  2632. max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
  2633. static int setup_swap_map_and_extents(struct swap_info_struct *p,
  2634. union swap_header *swap_header,
  2635. unsigned char *swap_map,
  2636. struct swap_cluster_info *cluster_info,
  2637. unsigned long maxpages,
  2638. sector_t *span)
  2639. {
  2640. unsigned int j, k;
  2641. unsigned int nr_good_pages;
  2642. int nr_extents;
  2643. unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
  2644. unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
  2645. unsigned long i, idx;
  2646. nr_good_pages = maxpages - 1; /* omit header page */
  2647. cluster_list_init(&p->free_clusters);
  2648. cluster_list_init(&p->discard_clusters);
  2649. for (i = 0; i < swap_header->info.nr_badpages; i++) {
  2650. unsigned int page_nr = swap_header->info.badpages[i];
  2651. if (page_nr == 0 || page_nr > swap_header->info.last_page)
  2652. return -EINVAL;
  2653. if (page_nr < maxpages) {
  2654. swap_map[page_nr] = SWAP_MAP_BAD;
  2655. nr_good_pages--;
  2656. /*
  2657. * Haven't marked the cluster free yet, no list
  2658. * operation involved
  2659. */
  2660. inc_cluster_info_page(p, cluster_info, page_nr);
  2661. }
  2662. }
  2663. /* Haven't marked the cluster free yet, no list operation involved */
  2664. for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
  2665. inc_cluster_info_page(p, cluster_info, i);
  2666. if (nr_good_pages) {
  2667. swap_map[0] = SWAP_MAP_BAD;
  2668. /*
  2669. * Haven't marked the cluster free yet, no list
  2670. * operation involved
  2671. */
  2672. inc_cluster_info_page(p, cluster_info, 0);
  2673. p->max = maxpages;
  2674. p->pages = nr_good_pages;
  2675. nr_extents = setup_swap_extents(p, span);
  2676. if (nr_extents < 0)
  2677. return nr_extents;
  2678. nr_good_pages = p->pages;
  2679. }
  2680. if (!nr_good_pages) {
  2681. pr_warn("Empty swap-file\n");
  2682. return -EINVAL;
  2683. }
  2684. if (!cluster_info)
  2685. return nr_extents;
  2686. /*
  2687. * Reduce false cache line sharing between cluster_info entries and
  2688. * avoid back-to-back allocations from clusters sharing the same address space.
  2689. */
  2690. for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
  2691. j = (k + col) % SWAP_CLUSTER_COLS;
  2692. for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
  2693. idx = i * SWAP_CLUSTER_COLS + j;
  2694. if (idx >= nr_clusters)
  2695. continue;
  2696. if (cluster_count(&cluster_info[idx]))
  2697. continue;
  2698. cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
  2699. cluster_list_add_tail(&p->free_clusters, cluster_info,
  2700. idx);
  2701. }
  2702. }
  2703. return nr_extents;
  2704. }
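/*
 * Illustrative example of the interleaving above, assuming the typical
 * SWAP_CLUSTER_COLS of 64 (SWAP_ADDRESS_SPACE_PAGES of 16k divided by a
 * SWAPFILE_CLUSTER of 256) and col == 0: free clusters are queued in the
 * order 0, 64, 128, ... then 1, 65, 129, ... so that clusters which are
 * adjacent on disk, and whose cluster_info entries share a cache line,
 * do not end up next to each other on the free list.
 */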
  2705. /*
  2706. * Helper to sys_swapon determining if a given swap
  2707. * backing device queue supports DISCARD operations.
  2708. */
  2709. static bool swap_discardable(struct swap_info_struct *si)
  2710. {
  2711. struct request_queue *q = bdev_get_queue(si->bdev);
  2712. if (!q || !blk_queue_discard(q))
  2713. return false;
  2714. return true;
  2715. }
  2716. SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
  2717. {
  2718. struct swap_info_struct *p;
  2719. struct filename *name;
  2720. struct file *swap_file = NULL;
  2721. struct address_space *mapping;
  2722. int prio;
  2723. int error;
  2724. union swap_header *swap_header;
  2725. int nr_extents;
  2726. sector_t span;
  2727. unsigned long maxpages;
  2728. unsigned char *swap_map = NULL;
  2729. struct swap_cluster_info *cluster_info = NULL;
  2730. unsigned long *frontswap_map = NULL;
  2731. struct page *page = NULL;
  2732. struct inode *inode = NULL;
  2733. bool inced_nr_rotate_swap = false;
  2734. if (swap_flags & ~SWAP_FLAGS_VALID)
  2735. return -EINVAL;
  2736. if (!capable(CAP_SYS_ADMIN))
  2737. return -EPERM;
  2738. if (!swap_avail_heads)
  2739. return -ENOMEM;
  2740. p = alloc_swap_info();
  2741. if (IS_ERR(p))
  2742. return PTR_ERR(p);
  2743. INIT_WORK(&p->discard_work, swap_discard_work);
  2744. name = getname(specialfile);
  2745. if (IS_ERR(name)) {
  2746. error = PTR_ERR(name);
  2747. name = NULL;
  2748. goto bad_swap;
  2749. }
  2750. swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
  2751. if (IS_ERR(swap_file)) {
  2752. error = PTR_ERR(swap_file);
  2753. swap_file = NULL;
  2754. goto bad_swap;
  2755. }
  2756. p->swap_file = swap_file;
  2757. mapping = swap_file->f_mapping;
  2758. inode = mapping->host;
  2759. /* If S_ISREG(inode->i_mode), claim_swapfile() will do inode_lock(inode) */
  2760. error = claim_swapfile(p, inode);
  2761. if (unlikely(error))
  2762. goto bad_swap;
  2763. /*
  2764. * Read the swap header.
  2765. */
  2766. if (!mapping->a_ops->readpage) {
  2767. error = -EINVAL;
  2768. goto bad_swap;
  2769. }
  2770. page = read_mapping_page(mapping, 0, swap_file);
  2771. if (IS_ERR(page)) {
  2772. error = PTR_ERR(page);
  2773. goto bad_swap;
  2774. }
  2775. swap_header = kmap(page);
  2776. maxpages = read_swap_header(p, swap_header, inode);
  2777. if (unlikely(!maxpages)) {
  2778. error = -EINVAL;
  2779. goto bad_swap;
  2780. }
  2781. /* OK, set up the swap map and apply the bad block list */
  2782. swap_map = vzalloc(maxpages);
  2783. if (!swap_map) {
  2784. error = -ENOMEM;
  2785. goto bad_swap;
  2786. }
  2787. if (bdi_cap_stable_pages_required(inode_to_bdi(inode)))
  2788. p->flags |= SWP_STABLE_WRITES;
  2789. if (bdi_cap_synchronous_io(inode_to_bdi(inode)))
  2790. p->flags |= SWP_SYNCHRONOUS_IO;
  2791. if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
  2792. int cpu;
  2793. unsigned long ci, nr_cluster;
  2794. p->flags |= SWP_SOLIDSTATE;
  2795. /*
  2796. * select a random position to start with to help wear-level
  2797. * the SSD
  2798. */
  2799. p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
  2800. nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
  2801. cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
  2802. GFP_KERNEL);
  2803. if (!cluster_info) {
  2804. error = -ENOMEM;
  2805. goto bad_swap;
  2806. }
  2807. for (ci = 0; ci < nr_cluster; ci++)
  2808. spin_lock_init(&((cluster_info + ci)->lock));
  2809. p->percpu_cluster = alloc_percpu(struct percpu_cluster);
  2810. if (!p->percpu_cluster) {
  2811. error = -ENOMEM;
  2812. goto bad_swap;
  2813. }
  2814. for_each_possible_cpu(cpu) {
  2815. struct percpu_cluster *cluster;
  2816. cluster = per_cpu_ptr(p->percpu_cluster, cpu);
  2817. cluster_set_null(&cluster->index);
  2818. }
  2819. } else {
  2820. atomic_inc(&nr_rotate_swap);
  2821. inced_nr_rotate_swap = true;
  2822. }
  2823. error = swap_cgroup_swapon(p->type, maxpages);
  2824. if (error)
  2825. goto bad_swap;
	nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
		cluster_info, maxpages, &span);
	if (unlikely(nr_extents < 0)) {
		error = nr_extents;
		goto bad_swap;
	}
	/* frontswap enabled? set up bit-per-page map for frontswap */
	if (IS_ENABLED(CONFIG_FRONTSWAP))
		frontswap_map = kvcalloc(BITS_TO_LONGS(maxpages),
					 sizeof(long),
					 GFP_KERNEL);

	if (p->bdev && (swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
		/*
		 * When discard is enabled for swap with no particular
		 * policy flagged, we set all swap discard flags here in
		 * order to sustain backward compatibility with older
		 * swapon(8) releases.
		 */
		p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
			     SWP_PAGE_DISCARD);

		/*
		 * By flagging sys_swapon, a sysadmin can tell us to
		 * either do single-time area discards only, or to just
		 * perform discards for released swap page-clusters.
		 * Now it's time to adjust the p->flags accordingly.
		 */
		if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
			p->flags &= ~SWP_PAGE_DISCARD;
		else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
			p->flags &= ~SWP_AREA_DISCARD;

		/* issue a swapon-time discard if it's still required */
		if (p->flags & SWP_AREA_DISCARD) {
			int err = discard_swap(p);
			if (unlikely(err))
				pr_err("swapon: discard_swap(%p): %d\n",
					p, err);
		}
	}
	error = init_swap_address_space(p->type, maxpages);
	if (error)
		goto bad_swap;

	mutex_lock(&swapon_mutex);
	prio = -1;
	if (swap_flags & SWAP_FLAG_PREFER)
		prio =
		  (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
	enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);

	pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
		p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
		nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
		(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
		(p->flags & SWP_DISCARDABLE) ? "D" : "",
		(p->flags & SWP_AREA_DISCARD) ? "s" : "",
		(p->flags & SWP_PAGE_DISCARD) ? "c" : "",
		(frontswap_map) ? "FS" : "");

	mutex_unlock(&swapon_mutex);
	atomic_inc(&proc_poll_event);
	wake_up_interruptible(&proc_poll_wait);

	if (S_ISREG(inode->i_mode))
		inode->i_flags |= S_SWAPFILE;
	error = 0;
	goto out;
bad_swap:
	free_percpu(p->percpu_cluster);
	p->percpu_cluster = NULL;
	if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
		set_blocksize(p->bdev, p->old_block_size);
		blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
	}
	destroy_swap_extents(p);
	swap_cgroup_swapoff(p->type);
	spin_lock(&swap_lock);
	p->swap_file = NULL;
	p->flags = 0;
	spin_unlock(&swap_lock);
	vfree(swap_map);
	kvfree(cluster_info);
	kvfree(frontswap_map);
	if (inced_nr_rotate_swap)
		atomic_dec(&nr_rotate_swap);
	if (swap_file) {
		if (inode && S_ISREG(inode->i_mode)) {
			inode_unlock(inode);
			inode = NULL;
		}
		filp_close(swap_file, NULL);
	}
out:
	if (page && !IS_ERR(page)) {
		kunmap(page);
		put_page(page);
	}
	if (name)
		putname(name);
	if (inode && S_ISREG(inode->i_mode))
		inode_unlock(inode);
	if (!error)
		enable_swap_slots_cache();
	return error;
}
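
/*
 * si_swapinfo - fill in the swap fields of a sysinfo structure.
 * Swap areas that are in the middle of being swapped off (SWP_USED set but
 * SWP_WRITEOK clear) still hold pages, so their inuse_pages are added back
 * to both the free and total counts to keep the two figures consistent.
 */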
void si_swapinfo(struct sysinfo *val)
{
	unsigned int type;
	unsigned long nr_to_be_unused = 0;

	spin_lock(&swap_lock);
	for (type = 0; type < nr_swapfiles; type++) {
		struct swap_info_struct *si = swap_info[type];

		if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
			nr_to_be_unused += si->inuse_pages;
	}
	val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
	val->totalswap = total_swap_pages + nr_to_be_unused;
	spin_unlock(&swap_lock);
}

/*
 * Verify that a swap entry is valid and increment its swap map count.
 *
 * Returns an error code in the following cases:
 * - success -> 0
 * - swp_entry is invalid -> EINVAL
 * - swp_entry is migration entry -> EINVAL
 * - swap-cache reference is requested but there is already one. -> EEXIST
 * - swap-cache reference is requested but the entry is not used. -> ENOENT
 * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
 */
static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
{
	struct swap_info_struct *p;
	struct swap_cluster_info *ci;
	unsigned long offset, type;
	unsigned char count;
	unsigned char has_cache;
	int err = -EINVAL;

	if (non_swap_entry(entry))
		goto out;

	type = swp_type(entry);
	if (type >= nr_swapfiles)
		goto bad_file;
	p = swap_info[type];
	offset = swp_offset(entry);
	if (unlikely(offset >= p->max))
		goto out;

	ci = lock_cluster_or_swap_info(p, offset);

	count = p->swap_map[offset];

	/*
	 * swapin_readahead() doesn't check if a swap entry is valid, so the
	 * swap entry could be SWAP_MAP_BAD. Check here with lock held.
	 */
	if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
		err = -ENOENT;
		goto unlock_out;
	}

	has_cache = count & SWAP_HAS_CACHE;
	count &= ~SWAP_HAS_CACHE;
	err = 0;

	if (usage == SWAP_HAS_CACHE) {

		/* set SWAP_HAS_CACHE if there is no cache and entry is used */
		if (!has_cache && count)
			has_cache = SWAP_HAS_CACHE;
		else if (has_cache)		/* someone else added cache */
			err = -EEXIST;
		else				/* no users remaining */
			err = -ENOENT;

	} else if (count || has_cache) {

		if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
			count += usage;
		else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
			err = -EINVAL;
		else if (swap_count_continued(p, offset, count))
			count = COUNT_CONTINUED;
		else
			err = -ENOMEM;
	} else
		err = -ENOENT;			/* unused swap entry */

	p->swap_map[offset] = count | has_cache;

unlock_out:
	unlock_cluster_or_swap_info(p, ci);
out:
	return err;

bad_file:
	pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
	goto out;
}
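
/*
 * Layout of a swap_map byte, as used by __swap_duplicate() above and by the
 * continuation code below (see include/linux/swap.h for the exact values):
 * the low bits hold the map count, capped at SWAP_MAP_MAX; SWAP_HAS_CACHE
 * marks a swap-cache reference; COUNT_CONTINUED marks that the count has
 * overflowed into the continuation pages managed by
 * add_swap_count_continuation() and swap_count_continued(); SWAP_MAP_BAD and
 * SWAP_MAP_SHMEM are special values for bad blocks and shmem/tmpfs entries.
 */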

/*
 * Help swapoff by noting that swap entry belongs to shmem/tmpfs
 * (in which case its reference count is never incremented).
 */
void swap_shmem_alloc(swp_entry_t entry)
{
	__swap_duplicate(entry, SWAP_MAP_SHMEM);
}

/*
 * Increase reference count of swap entry by 1.
 * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
 * but could not be atomically allocated. Returns 0, just as if it succeeded,
 * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
 * might occur if a page table entry has got corrupted.
 */
int swap_duplicate(swp_entry_t entry)
{
	int err = 0;

	while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
		err = add_swap_count_continuation(entry, GFP_ATOMIC);
	return err;
}

/*
 * @entry: swap entry for which we allocate swap cache.
 *
 * Called when allocating swap cache for an existing swap entry.
 * This can return error codes; returns 0 on success.
 * -EEXIST means there is already a swap cache for this entry.
 * Note: return code is different from swap_duplicate().
 */
int swapcache_prepare(swp_entry_t entry)
{
	return __swap_duplicate(entry, SWAP_HAS_CACHE);
}
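
/*
 * Look up the swap_info_struct backing a swap entry, or the one backing a
 * swap-cache page: for such pages, page_private() holds the swp_entry_t.
 */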
struct swap_info_struct *swp_swap_info(swp_entry_t entry)
{
	return swap_info[swp_type(entry)];
}

struct swap_info_struct *page_swap_info(struct page *page)
{
	swp_entry_t entry = { .val = page_private(page) };

	return swp_swap_info(entry);
}

/*
 * out-of-line __page_file_ methods to avoid include hell.
 */
struct address_space *__page_file_mapping(struct page *page)
{
	return page_swap_info(page)->swap_file->f_mapping;
}
EXPORT_SYMBOL_GPL(__page_file_mapping);

pgoff_t __page_file_index(struct page *page)
{
	swp_entry_t swap = { .val = page_private(page) };

	return swp_offset(swap);
}
EXPORT_SYMBOL_GPL(__page_file_index);

/*
 * add_swap_count_continuation - called when a swap count is duplicated
 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
 * page of the original vmalloc'ed swap_map, to hold the continuation count
 * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called
 * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
 *
 * These continuation pages are seldom referenced: the common paths all work
 * on the original swap_map, only referring to a continuation page when the
 * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
 *
 * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
 * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
 * can be called after dropping locks.
 */
int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
{
	struct swap_info_struct *si;
	struct swap_cluster_info *ci;
	struct page *head;
	struct page *page;
	struct page *list_page;
	pgoff_t offset;
	unsigned char count;

	/*
	 * When debugging, it's easier to use __GFP_ZERO here; but it's better
	 * for latency not to zero a page while GFP_ATOMIC and holding locks.
	 */
	page = alloc_page(gfp_mask | __GFP_HIGHMEM);

	si = swap_info_get(entry);
	if (!si) {
		/*
		 * An acceptable race has occurred since the failing
		 * __swap_duplicate(): the swap entry has been freed,
		 * perhaps even the whole swap_map cleared for swapoff.
		 */
		goto outer;
	}

	offset = swp_offset(entry);
	ci = lock_cluster(si, offset);

	count = si->swap_map[offset] & ~SWAP_HAS_CACHE;

	if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
		/*
		 * The higher the swap count, the more likely it is that tasks
		 * will race to add swap count continuation: we need to avoid
		 * over-provisioning.
		 */
		goto out;
	}

	if (!page) {
		unlock_cluster(ci);
		spin_unlock(&si->lock);
		return -ENOMEM;
	}

	/*
	 * We are fortunate that although vmalloc_to_page uses pte_offset_map,
	 * no architecture is using highmem pages for kernel page tables: so it
	 * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps.
	 */
	head = vmalloc_to_page(si->swap_map + offset);
	offset &= ~PAGE_MASK;

	spin_lock(&si->cont_lock);
	/*
	 * Page allocation does not initialize the page's lru field,
	 * but it does always reset its private field.
	 */
	if (!page_private(head)) {
		BUG_ON(count & COUNT_CONTINUED);
		INIT_LIST_HEAD(&head->lru);
		set_page_private(head, SWP_CONTINUED);
		si->flags |= SWP_CONTINUED;
	}

	list_for_each_entry(list_page, &head->lru, lru) {
		unsigned char *map;

		/*
		 * If the previous map said no continuation, but we've found
		 * a continuation page, free our allocation and use this one.
		 */
		if (!(count & COUNT_CONTINUED))
			goto out_unlock_cont;

		map = kmap_atomic(list_page) + offset;
		count = *map;
		kunmap_atomic(map);

		/*
		 * If this continuation count now has some space in it,
		 * free our allocation and use this one.
		 */
		if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
			goto out_unlock_cont;
	}

	list_add_tail(&page->lru, &head->lru);
	page = NULL;			/* now it's attached, don't free it */
out_unlock_cont:
	spin_unlock(&si->cont_lock);
out:
	unlock_cluster(ci);
	spin_unlock(&si->lock);
outer:
	if (page)
		__free_page(page);
	return 0;
}
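
/*
 * Worked example of the "digit" arithmetic performed below: treat the
 * swap_map byte as the low digit and each continuation page as a higher
 * digit. Incrementing a count that already sits at
 * SWAP_MAP_MAX | COUNT_CONTINUED carries into the first continuation byte;
 * once that byte reaches SWAP_CONT_MAX the carry moves to the next
 * continuation page, just as 999 + 1 carries through every 9. Decrementing
 * from COUNT_CONTINUED borrows back in the opposite direction, like 1000 - 1.
 */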

/*
 * swap_count_continued - when the original swap_map count is incremented
 * from SWAP_MAP_MAX, check if there is already a continuation page to carry
 * into, carry if so, or else fail until a new continuation page is allocated;
 * when the original swap_map count is decremented from 0 with continuation,
 * borrow from the continuation and report whether it still holds more.
 *
 * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster
 * lock.
 */
static bool swap_count_continued(struct swap_info_struct *si,
				 pgoff_t offset, unsigned char count)
{
	struct page *head;
	struct page *page;
	unsigned char *map;
	bool ret;

	head = vmalloc_to_page(si->swap_map + offset);
	if (page_private(head) != SWP_CONTINUED) {
		BUG_ON(count & COUNT_CONTINUED);
		return false;		/* need to add count continuation */
	}

	spin_lock(&si->cont_lock);
	offset &= ~PAGE_MASK;
	page = list_entry(head->lru.next, struct page, lru);
	map = kmap_atomic(page) + offset;

	if (count == SWAP_MAP_MAX)	/* initial increment from swap_map */
		goto init_map;		/* jump over SWAP_CONT_MAX checks */

	if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
		/*
		 * Think of how you add 1 to 999
		 */
		while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
			kunmap_atomic(map);
			page = list_entry(page->lru.next, struct page, lru);
			BUG_ON(page == head);
			map = kmap_atomic(page) + offset;
		}
		if (*map == SWAP_CONT_MAX) {
			kunmap_atomic(map);
			page = list_entry(page->lru.next, struct page, lru);
			if (page == head) {
				ret = false;	/* add count continuation */
				goto out;
			}
			map = kmap_atomic(page) + offset;
init_map:		*map = 0;		/* we didn't zero the page */
		}
		*map += 1;
		kunmap_atomic(map);
		page = list_entry(page->lru.prev, struct page, lru);
		while (page != head) {
			map = kmap_atomic(page) + offset;
			*map = COUNT_CONTINUED;
			kunmap_atomic(map);
			page = list_entry(page->lru.prev, struct page, lru);
		}
		ret = true;			/* incremented */

	} else {				/* decrementing */
		/*
		 * Think of how you subtract 1 from 1000
		 */
		BUG_ON(count != COUNT_CONTINUED);
		while (*map == COUNT_CONTINUED) {
			kunmap_atomic(map);
			page = list_entry(page->lru.next, struct page, lru);
			BUG_ON(page == head);
			map = kmap_atomic(page) + offset;
		}
		BUG_ON(*map == 0);
		*map -= 1;
		if (*map == 0)
			count = 0;
		kunmap_atomic(map);
		page = list_entry(page->lru.prev, struct page, lru);
		while (page != head) {
			map = kmap_atomic(page) + offset;
			*map = SWAP_CONT_MAX | count;
			count = COUNT_CONTINUED;
			kunmap_atomic(map);
			page = list_entry(page->lru.prev, struct page, lru);
		}
		ret = count == COUNT_CONTINUED;
	}
out:
	spin_unlock(&si->cont_lock);
	return ret;
}

/*
 * free_swap_count_continuations - called by swapoff to free all the
 * continuation pages appended to the swap_map, after swap_map is quiesced,
 * before vfree'ing it.
 */
static void free_swap_count_continuations(struct swap_info_struct *si)
{
	pgoff_t offset;

	/*
	 * swap_map is a byte array, so each vmalloc'ed page of it covers
	 * PAGE_SIZE swap entries and carries at most one continuation list.
	 */
	for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
		struct page *head;

		head = vmalloc_to_page(si->swap_map + offset);
		if (page_private(head)) {
			struct page *page, *next;

			list_for_each_entry_safe(page, next, &head->lru, lru) {
				list_del(&page->lru);
				__free_page(page);
			}
		}
	}
}

#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
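/*
 * Schedule blk-cgroup throttling for the current task when its block cgroup
 * is congested and it is about to allocate swap on this node. Skipped when
 * the allocation cannot do IO, when no memcg is involved, or when a throttle
 * has already been queued for this task.
 */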
void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
				  gfp_t gfp_mask)
{
	struct swap_info_struct *si, *next;

	if (!(gfp_mask & __GFP_IO) || !memcg)
		return;

	if (!blk_cgroup_congested())
		return;

	/*
	 * We've already scheduled a throttle, avoid taking the global swap
	 * lock.
	 */
	if (current->throttle_queue)
		return;

	spin_lock(&swap_avail_lock);
	plist_for_each_entry_safe(si, next, &swap_avail_heads[node],
				  avail_lists[node]) {
		if (si->bdev) {
			blkcg_schedule_throttle(bdev_get_queue(si->bdev),
						true);
			break;
		}
	}
	spin_unlock(&swap_avail_lock);
}
#endif
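
/*
 * Early init: allocate and initialize the per-node plists used to pick the
 * highest-priority swap device on each node. If this allocation fails, swap
 * stays disabled, because swapon() bails out when swap_avail_heads is NULL.
 */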
static int __init swapfile_init(void)
{
	int nid;

	swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
					 GFP_KERNEL);
	if (!swap_avail_heads) {
		pr_emerg("Not enough memory for swap heads, swap is disabled\n");
		return -ENOMEM;
	}

	for_each_node(nid)
		plist_head_init(&swap_avail_heads[nid]);

	return 0;
}
subsys_initcall(swapfile_init);