init_64.c 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497
  1. /*
  2. * linux/arch/x86_64/mm/init.c
  3. *
  4. * Copyright (C) 1995 Linus Torvalds
  5. * Copyright (C) 2000 Pavel Machek <pavel@ucw.cz>
  6. * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
  7. */
  8. #include <linux/signal.h>
  9. #include <linux/sched.h>
  10. #include <linux/kernel.h>
  11. #include <linux/errno.h>
  12. #include <linux/string.h>
  13. #include <linux/types.h>
  14. #include <linux/ptrace.h>
  15. #include <linux/mman.h>
  16. #include <linux/mm.h>
  17. #include <linux/swap.h>
  18. #include <linux/smp.h>
  19. #include <linux/init.h>
  20. #include <linux/initrd.h>
  21. #include <linux/pagemap.h>
  22. #include <linux/bootmem.h>
  23. #include <linux/memblock.h>
  24. #include <linux/proc_fs.h>
  25. #include <linux/pci.h>
  26. #include <linux/pfn.h>
  27. #include <linux/poison.h>
  28. #include <linux/dma-mapping.h>
  29. #include <linux/memory.h>
  30. #include <linux/memory_hotplug.h>
  31. #include <linux/memremap.h>
  32. #include <linux/nmi.h>
  33. #include <linux/gfp.h>
  34. #include <linux/kcore.h>
  35. #include <asm/processor.h>
  36. #include <asm/bios_ebda.h>
  37. #include <linux/uaccess.h>
  38. #include <asm/pgtable.h>
  39. #include <asm/pgalloc.h>
  40. #include <asm/dma.h>
  41. #include <asm/fixmap.h>
  42. #include <asm/e820/api.h>
  43. #include <asm/apic.h>
  44. #include <asm/tlb.h>
  45. #include <asm/mmu_context.h>
  46. #include <asm/proto.h>
  47. #include <asm/smp.h>
  48. #include <asm/sections.h>
  49. #include <asm/kdebug.h>
  50. #include <asm/numa.h>
  51. #include <asm/set_memory.h>
  52. #include <asm/init.h>
  53. #include <asm/uv/uv.h>
  54. #include <asm/setup.h>
  55. #include "mm_internal.h"
  56. #include "ident_map.c"
  57. /*
  58. * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
  59. * physical space so we can cache the place of the first one and move
  60. * around without checking the pgd every time.
  61. */
/* Mask of PTE attribute bits the CPU supports; starts as "all bits set". */
pteval_t __supported_pte_mask __read_mostly = ~0;
EXPORT_SYMBOL_GPL(__supported_pte_mask);

/* Personality flag bits forced onto 32-bit tasks (see nonx32_setup()). */
int force_personality32;
  65. /*
  66. * noexec32=on|off
  67. * Control non executable heap for 32bit processes.
  68. * To control the stack too use noexec=off
  69. *
  70. * on PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
  71. * off PROT_READ implies PROT_EXEC
  72. */
  73. static int __init nonx32_setup(char *str)
  74. {
  75. if (!strcmp(str, "on"))
  76. force_personality32 &= ~READ_IMPLIES_EXEC;
  77. else if (!strcmp(str, "off"))
  78. force_personality32 |= READ_IMPLIES_EXEC;
  79. return 1;
  80. }
  81. __setup("noexec32=", nonx32_setup);
  82. /*
  83. * When memory was added make sure all the processes MM have
  84. * suitable PGD entries in the local PGD level page.
  85. */
  86. #ifdef CONFIG_X86_5LEVEL
/*
 * Propagate kernel PGD entries covering [start, end] from init_mm into
 * the PGD page of every process on pgd_list (5-level paging: the pgd
 * level is real, so synchronization happens here).
 */
void sync_global_pgds(unsigned long start, unsigned long end)
{
	unsigned long addr;

	for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
		const pgd_t *pgd_ref = pgd_offset_k(addr);
		struct page *page;

		/* Check for overflow */
		if (addr < start)
			break;

		/* Nothing in init_mm for this slot: nothing to copy. */
		if (pgd_none(*pgd_ref))
			continue;

		spin_lock(&pgd_lock);
		list_for_each_entry(page, &pgd_list, lru) {
			pgd_t *pgd;
			spinlock_t *pgt_lock;

			pgd = (pgd_t *)page_address(page) + pgd_index(addr);
			/* the pgt_lock only for Xen */
			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
			spin_lock(pgt_lock);

			/* A populated entry must already match init_mm's. */
			if (!pgd_none(*pgd_ref) && !pgd_none(*pgd))
				BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

			if (pgd_none(*pgd))
				set_pgd(pgd, *pgd_ref);

			spin_unlock(pgt_lock);
		}
		spin_unlock(&pgd_lock);
	}
}
  115. #else
/*
 * Propagate kernel mappings covering [start, end] from init_mm into every
 * process PGD page (4-level paging: the p4d is folded into the pgd, so
 * synchronization has to be done one level down, at the p4d entries).
 */
void sync_global_pgds(unsigned long start, unsigned long end)
{
	unsigned long addr;

	for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
		pgd_t *pgd_ref = pgd_offset_k(addr);
		const p4d_t *p4d_ref;
		struct page *page;

		/*
		 * With folded p4d, pgd_none() is always false, we need to
		 * handle synchonization on p4d level.
		 */
		BUILD_BUG_ON(pgd_none(*pgd_ref));
		p4d_ref = p4d_offset(pgd_ref, addr);

		if (p4d_none(*p4d_ref))
			continue;

		spin_lock(&pgd_lock);
		list_for_each_entry(page, &pgd_list, lru) {
			pgd_t *pgd;
			p4d_t *p4d;
			spinlock_t *pgt_lock;

			pgd = (pgd_t *)page_address(page) + pgd_index(addr);
			p4d = p4d_offset(pgd, addr);
			/* the pgt_lock only for Xen */
			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
			spin_lock(pgt_lock);

			/* A populated entry must already match init_mm's. */
			if (!p4d_none(*p4d_ref) && !p4d_none(*p4d))
				BUG_ON(p4d_page_vaddr(*p4d)
				       != p4d_page_vaddr(*p4d_ref));

			if (p4d_none(*p4d))
				set_p4d(p4d, *p4d_ref);

			spin_unlock(pgt_lock);
		}
		spin_unlock(&pgd_lock);
	}
}
  151. #endif
  152. /*
  153. * NOTE: This function is marked __ref because it calls __init function
  154. * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
  155. */
/*
 * Allocate one zeroed, page-aligned page for kernel page tables, from
 * bootmem before the page allocator is up and from the page allocator
 * afterwards (which is why this is __ref; see comment above).
 */
static __ref void *spp_getpage(void)
{
	void *ptr;

	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);

	/* A page-table page must exist and be naturally page-aligned. */
	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
		panic("set_pte_phys: cannot allocate page data %s\n",
			after_bootmem ? "after bootmem" : "");
	}

	pr_debug("spp_getpage %p\n", ptr);

	return ptr;
}
  170. static p4d_t *fill_p4d(pgd_t *pgd, unsigned long vaddr)
  171. {
  172. if (pgd_none(*pgd)) {
  173. p4d_t *p4d = (p4d_t *)spp_getpage();
  174. pgd_populate(&init_mm, pgd, p4d);
  175. if (p4d != p4d_offset(pgd, 0))
  176. printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
  177. p4d, p4d_offset(pgd, 0));
  178. }
  179. return p4d_offset(pgd, vaddr);
  180. }
  181. static pud_t *fill_pud(p4d_t *p4d, unsigned long vaddr)
  182. {
  183. if (p4d_none(*p4d)) {
  184. pud_t *pud = (pud_t *)spp_getpage();
  185. p4d_populate(&init_mm, p4d, pud);
  186. if (pud != pud_offset(p4d, 0))
  187. printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
  188. pud, pud_offset(p4d, 0));
  189. }
  190. return pud_offset(p4d, vaddr);
  191. }
  192. static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
  193. {
  194. if (pud_none(*pud)) {
  195. pmd_t *pmd = (pmd_t *) spp_getpage();
  196. pud_populate(&init_mm, pud, pmd);
  197. if (pmd != pmd_offset(pud, 0))
  198. printk(KERN_ERR "PAGETABLE BUG #02! %p <-> %p\n",
  199. pmd, pmd_offset(pud, 0));
  200. }
  201. return pmd_offset(pud, vaddr);
  202. }
  203. static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
  204. {
  205. if (pmd_none(*pmd)) {
  206. pte_t *pte = (pte_t *) spp_getpage();
  207. pmd_populate_kernel(&init_mm, pmd, pte);
  208. if (pte != pte_offset_kernel(pmd, 0))
  209. printk(KERN_ERR "PAGETABLE BUG #03!\n");
  210. }
  211. return pte_offset_kernel(pmd, vaddr);
  212. }
  213. static void __set_pte_vaddr(pud_t *pud, unsigned long vaddr, pte_t new_pte)
  214. {
  215. pmd_t *pmd = fill_pmd(pud, vaddr);
  216. pte_t *pte = fill_pte(pmd, vaddr);
  217. set_pte(pte, new_pte);
  218. /*
  219. * It's enough to flush this one mapping.
  220. * (PGE mappings get flushed as well)
  221. */
  222. __flush_tlb_one(vaddr);
  223. }
  224. void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte)
  225. {
  226. p4d_t *p4d = p4d_page + p4d_index(vaddr);
  227. pud_t *pud = fill_pud(p4d, vaddr);
  228. __set_pte_vaddr(pud, vaddr, new_pte);
  229. }
  230. void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
  231. {
  232. pud_t *pud = pud_page + pud_index(vaddr);
  233. __set_pte_vaddr(pud, vaddr, new_pte);
  234. }
  235. void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
  236. {
  237. pgd_t *pgd;
  238. p4d_t *p4d_page;
  239. pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));
  240. pgd = pgd_offset_k(vaddr);
  241. if (pgd_none(*pgd)) {
  242. printk(KERN_ERR
  243. "PGD FIXMAP MISSING, it should be setup in head.S!\n");
  244. return;
  245. }
  246. p4d_page = p4d_offset(pgd, 0);
  247. set_pte_vaddr_p4d(p4d_page, vaddr, pteval);
  248. }
  249. pmd_t * __init populate_extra_pmd(unsigned long vaddr)
  250. {
  251. pgd_t *pgd;
  252. p4d_t *p4d;
  253. pud_t *pud;
  254. pgd = pgd_offset_k(vaddr);
  255. p4d = fill_p4d(pgd, vaddr);
  256. pud = fill_pud(p4d, vaddr);
  257. return fill_pmd(pud, vaddr);
  258. }
  259. pte_t * __init populate_extra_pte(unsigned long vaddr)
  260. {
  261. pmd_t *pmd;
  262. pmd = populate_extra_pmd(vaddr);
  263. return fill_pte(pmd, vaddr);
  264. }
  265. /*
  266. * Create large page table mappings for a range of physical addresses.
  267. */
/*
 * Create large (2M) page table mappings for a range of physical addresses
 * with the given cache mode, allocating intermediate table pages on demand.
 * @phys and @size must both be 2M-aligned.
 */
static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
					enum page_cache_mode cache)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pgprot_t prot;

	/* Large-page kernel protection with the cache mode folded in. */
	pgprot_val(prot) = pgprot_val(PAGE_KERNEL_LARGE) |
		pgprot_val(pgprot_4k_2_large(cachemode2pgprot(cache)));
	BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
	for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
		pgd = pgd_offset_k((unsigned long)__va(phys));
		if (pgd_none(*pgd)) {
			p4d = (p4d_t *) spp_getpage();
			set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE |
					   _PAGE_USER));
		}
		p4d = p4d_offset(pgd, (unsigned long)__va(phys));
		if (p4d_none(*p4d)) {
			pud = (pud_t *) spp_getpage();
			set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE |
					   _PAGE_USER));
		}
		pud = pud_offset(p4d, (unsigned long)__va(phys));
		if (pud_none(*pud)) {
			pmd = (pmd_t *) spp_getpage();
			set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
					   _PAGE_USER));
		}
		/*
		 * NOTE(review): indexes with phys rather than __va(phys) —
		 * presumably equivalent because the direct-map offset is
		 * PMD-aligned; confirm before relying on it.
		 */
		pmd = pmd_offset(pud, phys);
		BUG_ON(!pmd_none(*pmd));
		set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
	}
}
/* Map [phys, phys + size) with 2M pages, write-back cacheable. */
void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, _PAGE_CACHE_MODE_WB);
}
/* Map [phys, phys + size) with 2M pages, uncached (for MMIO-style use). */
void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, _PAGE_CACHE_MODE_UC);
}
  311. /*
  312. * The head.S code sets up the kernel high mapping:
  313. *
  314. * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
  315. *
  316. * phys_base holds the negative offset to the kernel, which is added
  317. * to the compile time generated pmds. This results in invalid pmds up
  318. * to the point where we hit the physaddr 0 mapping.
  319. *
  320. * We limit the mappings to the region from _text to _brk_end. _brk_end
  321. * is rounded up to the 2MB boundary. This catches the invalid pmds as
  322. * well, as they are located before _text:
  323. */
/*
 * Trim the head.S kernel high mapping: clear every 2M pmd entry that lies
 * outside [_text, _brk_end rounded up to 2M), removing the invalid pmds
 * generated by the phys_base offsetting.
 */
void __init cleanup_highmap(void)
{
	unsigned long vaddr = __START_KERNEL_map;
	unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE;
	unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
	pmd_t *pmd = level2_kernel_pgt;

	/*
	 * Native path, max_pfn_mapped is not set yet.
	 * Xen has valid max_pfn_mapped set in
	 * arch/x86/xen/mmu.c:xen_setup_kernel_pagetable().
	 */
	if (max_pfn_mapped)
		vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);

	for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
		if (pmd_none(*pmd))
			continue;
		/* Zap anything before _text or after the rounded-up brk. */
		if (vaddr < (unsigned long) _text || vaddr > end)
			set_pmd(pmd, __pmd(0));
	}
}
  344. /*
  345. * Create PTE level page table mapping for physical addresses.
  346. * It returns the last physical address mapped.
  347. */
/*
 * Create PTE level page table mapping for physical addresses
 * [paddr, paddr_end) within the pte page @pte_page, using protection
 * @prot. Returns the last physical address mapped.
 */
static unsigned long __meminit
phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
	      pgprot_t prot)
{
	unsigned long pages = 0, paddr_next;
	unsigned long paddr_last = paddr_end;
	pte_t *pte;
	int i;

	pte = pte_page + pte_index(paddr);
	i = pte_index(paddr);

	for (; i < PTRS_PER_PTE; i++, paddr = paddr_next, pte++) {
		paddr_next = (paddr & PAGE_MASK) + PAGE_SIZE;
		if (paddr >= paddr_end) {
			/*
			 * Past the requested range: during early boot,
			 * clear entries that cover neither RAM nor
			 * kernel-reserved memory so no stale mapping is
			 * left behind.
			 */
			if (!after_bootmem &&
			    !e820__mapped_any(paddr & PAGE_MASK, paddr_next,
					     E820_TYPE_RAM) &&
			    !e820__mapped_any(paddr & PAGE_MASK, paddr_next,
					     E820_TYPE_RESERVED_KERN))
				set_pte(pte, __pte(0));
			continue;
		}

		/*
		 * We will re-use the existing mapping.
		 * Xen for example has some special requirements, like mapping
		 * pagetable pages as RO. So assume someone who pre-setup
		 * these mappings are more intelligent.
		 */
		if (!pte_none(*pte)) {
			if (!after_bootmem)
				pages++;
			continue;
		}

		/* Compiled-out debug trace; flip to 1 when diagnosing. */
		if (0)
			pr_info(" pte=%p addr=%lx pte=%016lx\n", pte, paddr,
				pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL).pte);
		pages++;
		set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
		paddr_last = (paddr & PAGE_MASK) + PAGE_SIZE;
	}

	update_page_count(PG_LEVEL_4K, pages);

	return paddr_last;
}
  390. /*
  391. * Create PMD level page table mapping for physical addresses. The virtual
  392. * and physical address have to be aligned at this level.
  393. * It returns the last physical address mapped.
  394. */
/*
 * Create PMD level page table mapping for physical addresses
 * [paddr, paddr_end). The virtual and physical address have to be
 * aligned at this level. Uses 2M pages when (1 << PG_LEVEL_2M) is set
 * in @page_size_mask, otherwise recurses to phys_pte_init().
 * Returns the last physical address mapped.
 */
static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
	      unsigned long page_size_mask, pgprot_t prot)
{
	unsigned long pages = 0, paddr_next;
	unsigned long paddr_last = paddr_end;

	int i = pmd_index(paddr);

	for (; i < PTRS_PER_PMD; i++, paddr = paddr_next) {
		pmd_t *pmd = pmd_page + pmd_index(paddr);
		pte_t *pte;
		pgprot_t new_prot = prot;

		paddr_next = (paddr & PMD_MASK) + PMD_SIZE;
		if (paddr >= paddr_end) {
			/* Clear out-of-range entries not backing any RAM. */
			if (!after_bootmem &&
			    !e820__mapped_any(paddr & PMD_MASK, paddr_next,
					     E820_TYPE_RAM) &&
			    !e820__mapped_any(paddr & PMD_MASK, paddr_next,
					     E820_TYPE_RESERVED_KERN))
				set_pmd(pmd, __pmd(0));
			continue;
		}

		if (!pmd_none(*pmd)) {
			/* Already points to a pte page: descend and fill it. */
			if (!pmd_large(*pmd)) {
				spin_lock(&init_mm.page_table_lock);
				pte = (pte_t *)pmd_page_vaddr(*pmd);
				paddr_last = phys_pte_init(pte, paddr,
							   paddr_end, prot);
				spin_unlock(&init_mm.page_table_lock);
				continue;
			}
			/*
			 * If we are ok with PG_LEVEL_2M mapping, then we will
			 * use the existing mapping,
			 *
			 * Otherwise, we will split the large page mapping but
			 * use the same existing protection bits except for
			 * large page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_2M)) {
				if (!after_bootmem)
					pages++;
				paddr_last = paddr_next;
				continue;
			}
			new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
		}

		/* Fresh entry and 2M pages allowed: map it directly. */
		if (page_size_mask & (1<<PG_LEVEL_2M)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pmd,
				pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT,
					__pgprot(pgprot_val(prot) | _PAGE_PSE)));
			spin_unlock(&init_mm.page_table_lock);
			paddr_last = paddr_next;
			continue;
		}

		/* Otherwise build a pte page and hook it in under the lock. */
		pte = alloc_low_page();
		paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot);

		spin_lock(&init_mm.page_table_lock);
		pmd_populate_kernel(&init_mm, pmd, pte);
		spin_unlock(&init_mm.page_table_lock);
	}
	update_page_count(PG_LEVEL_2M, pages);
	return paddr_last;
}
  464. /*
  465. * Create PUD level page table mapping for physical addresses. The virtual
  466. * and physical address do not have to be aligned at this level. KASLR can
  467. * randomize virtual addresses up to this level.
  468. * It returns the last physical address mapped.
  469. */
/*
 * Create PUD level page table mapping for physical addresses
 * [paddr, paddr_end). The virtual and physical address do not have to
 * be aligned at this level. KASLR can randomize virtual addresses up to
 * this level, which is why indexing is done via __va(paddr).
 * Uses 1G pages when (1 << PG_LEVEL_1G) is set in @page_size_mask.
 * Returns the last physical address mapped.
 */
static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
	      unsigned long page_size_mask)
{
	unsigned long pages = 0, paddr_next;
	unsigned long paddr_last = paddr_end;
	unsigned long vaddr = (unsigned long)__va(paddr);
	int i = pud_index(vaddr);

	for (; i < PTRS_PER_PUD; i++, paddr = paddr_next) {
		pud_t *pud;
		pmd_t *pmd;
		pgprot_t prot = PAGE_KERNEL;

		vaddr = (unsigned long)__va(paddr);
		pud = pud_page + pud_index(vaddr);
		paddr_next = (paddr & PUD_MASK) + PUD_SIZE;

		if (paddr >= paddr_end) {
			/* Clear out-of-range entries not backing any RAM. */
			if (!after_bootmem &&
			    !e820__mapped_any(paddr & PUD_MASK, paddr_next,
					     E820_TYPE_RAM) &&
			    !e820__mapped_any(paddr & PUD_MASK, paddr_next,
					     E820_TYPE_RESERVED_KERN))
				set_pud(pud, __pud(0));
			continue;
		}

		if (!pud_none(*pud)) {
			/* Already points to a pmd page: descend and fill it. */
			if (!pud_large(*pud)) {
				pmd = pmd_offset(pud, 0);
				paddr_last = phys_pmd_init(pmd, paddr,
							   paddr_end,
							   page_size_mask,
							   prot);
				__flush_tlb_all();
				continue;
			}
			/*
			 * If we are ok with PG_LEVEL_1G mapping, then we will
			 * use the existing mapping.
			 *
			 * Otherwise, we will split the gbpage mapping but use
			 * the same existing protection bits except for large
			 * page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_1G)) {
				if (!after_bootmem)
					pages++;
				paddr_last = paddr_next;
				continue;
			}
			prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
		}

		/* Fresh entry and 1G pages allowed: map it directly. */
		if (page_size_mask & (1<<PG_LEVEL_1G)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pud,
				pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT,
					PAGE_KERNEL_LARGE));
			spin_unlock(&init_mm.page_table_lock);
			paddr_last = paddr_next;
			continue;
		}

		/* Otherwise build a pmd page and hook it in under the lock. */
		pmd = alloc_low_page();
		paddr_last = phys_pmd_init(pmd, paddr, paddr_end,
					   page_size_mask, prot);

		spin_lock(&init_mm.page_table_lock);
		pud_populate(&init_mm, pud, pmd);
		spin_unlock(&init_mm.page_table_lock);
	}
	__flush_tlb_all();

	update_page_count(PG_LEVEL_1G, pages);

	return paddr_last;
}
/*
 * Create P4D level page table mapping for physical addresses
 * [paddr, paddr_end). With 4-level paging the p4d is folded, so the
 * work is delegated straight to phys_pud_init().
 * Returns the last physical address mapped.
 */
static unsigned long __meminit
phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
	      unsigned long page_size_mask)
{
	unsigned long paddr_next, paddr_last = paddr_end;
	unsigned long vaddr = (unsigned long)__va(paddr);
	int i = p4d_index(vaddr);

	/* Folded p4d: the "p4d page" is really a pud page. */
	if (!IS_ENABLED(CONFIG_X86_5LEVEL))
		return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask);

	for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) {
		p4d_t *p4d;
		pud_t *pud;

		vaddr = (unsigned long)__va(paddr);
		p4d = p4d_page + p4d_index(vaddr);
		paddr_next = (paddr & P4D_MASK) + P4D_SIZE;

		if (paddr >= paddr_end) {
			/* Clear out-of-range entries not backing any RAM. */
			if (!after_bootmem &&
			    !e820__mapped_any(paddr & P4D_MASK, paddr_next,
					     E820_TYPE_RAM) &&
			    !e820__mapped_any(paddr & P4D_MASK, paddr_next,
					     E820_TYPE_RESERVED_KERN))
				set_p4d(p4d, __p4d(0));
			continue;
		}

		/* Already populated: descend into the existing pud page. */
		if (!p4d_none(*p4d)) {
			pud = pud_offset(p4d, 0);
			paddr_last = phys_pud_init(pud, paddr,
					paddr_end,
					page_size_mask);
			__flush_tlb_all();
			continue;
		}

		/* Otherwise build a pud page and hook it in under the lock. */
		pud = alloc_low_page();
		paddr_last = phys_pud_init(pud, paddr, paddr_end,
					   page_size_mask);

		spin_lock(&init_mm.page_table_lock);
		p4d_populate(&init_mm, p4d, pud);
		spin_unlock(&init_mm.page_table_lock);
	}
	__flush_tlb_all();

	return paddr_last;
}
  587. /*
  588. * Create page table mapping for the physical memory for specific physical
  589. * addresses. The virtual and physical addresses have to be aligned on PMD level
  590. * down. It returns the last physical address mapped.
  591. */
/*
 * Create page table mapping for the physical memory for specific physical
 * addresses. The virtual and physical addresses have to be aligned on PMD
 * level down. @page_size_mask selects which large page sizes may be used.
 * It returns the last physical address mapped.
 */
unsigned long __meminit
kernel_physical_mapping_init(unsigned long paddr_start,
			     unsigned long paddr_end,
			     unsigned long page_size_mask)
{
	bool pgd_changed = false;
	unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last;

	paddr_last = paddr_end;
	vaddr = (unsigned long)__va(paddr_start);
	vaddr_end = (unsigned long)__va(paddr_end);
	vaddr_start = vaddr;

	for (; vaddr < vaddr_end; vaddr = vaddr_next) {
		pgd_t *pgd = pgd_offset_k(vaddr);
		p4d_t *p4d;

		vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE;

		/* Top-level entry already present: just descend into it. */
		if (pgd_val(*pgd)) {
			p4d = (p4d_t *)pgd_page_vaddr(*pgd);
			paddr_last = phys_p4d_init(p4d, __pa(vaddr),
						   __pa(vaddr_end),
						   page_size_mask);
			continue;
		}

		/* Build the lower levels, then publish them under the lock. */
		p4d = alloc_low_page();
		paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end),
					   page_size_mask);

		spin_lock(&init_mm.page_table_lock);
		if (IS_ENABLED(CONFIG_X86_5LEVEL))
			pgd_populate(&init_mm, pgd, p4d);
		else
			/* Folded p4d: the new table is really a pud page. */
			p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d);
		spin_unlock(&init_mm.page_table_lock);
		pgd_changed = true;
	}

	/* New top-level entries must be copied into every process PGD. */
	if (pgd_changed)
		sync_global_pgds(vaddr_start, vaddr_end - 1);

	__flush_tlb_all();

	return paddr_last;
}
  630. #ifndef CONFIG_NUMA
/* Without NUMA, attribute all of memblock's memory to node 0. */
void __init initmem_init(void)
{
	memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
}
  635. #endif
/* Finish paging setup: sparse memmap, node state bookkeeping, zones. */
void __init paging_init(void)
{
	sparse_memory_present_with_active_regions(MAX_NUMNODES);
	sparse_init();

	/*
	 * clear the default setting with node 0
	 * note: don't use nodes_clear here, that is really clearing when
	 * numa support is not compiled in, and later node_set_state
	 * will not set it back.
	 */
	node_clear_state(0, N_MEMORY);
	if (N_MEMORY != N_NORMAL_MEMORY)
		node_clear_state(0, N_NORMAL_MEMORY);

	zone_sizes_init();
}
  651. /*
  652. * Memory hotplug specific functions
  653. */
  654. #ifdef CONFIG_MEMORY_HOTPLUG
  655. /*
  656. * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
  657. * updating.
  658. */
  659. static void update_end_of_memory_vars(u64 start, u64 size)
  660. {
  661. unsigned long end_pfn = PFN_UP(start + size);
  662. if (end_pfn > max_pfn) {
  663. max_pfn = end_pfn;
  664. max_low_pfn = end_pfn;
  665. high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
  666. }
  667. }
/*
 * Hot-add @size bytes of memory at physical address @start to node @nid:
 * extend the kernel direct mapping, then register the pages with the
 * core MM. Returns the __add_pages() result.
 */
int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
{
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	init_memory_mapping(start, start + size);

	ret = __add_pages(nid, start_pfn, nr_pages, want_memblock);
	WARN_ON_ONCE(ret);

	/* update max_pfn, max_low_pfn and high_memory */
	update_end_of_memory_vars(start, size);

	return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);
  681. #define PAGE_INUSE 0xFD
/*
 * Free an order-@order page-table (or vmemmap) page, dispatching on its
 * origin: device altmap pages go back to the altmap, bootmem-reserved
 * pages are released via the bootmem magic in page->freelist, and
 * everything else goes back to the page allocator.
 */
static void __meminit free_pagetable(struct page *page, int order)
{
	unsigned long magic;
	unsigned int nr_pages = 1 << order;
	struct vmem_altmap *altmap = to_vmem_altmap((unsigned long) page);

	if (altmap) {
		vmem_altmap_free(altmap, nr_pages);
		return;
	}

	/* bootmem page has reserved flag */
	if (PageReserved(page)) {
		__ClearPageReserved(page);

		magic = (unsigned long)page->freelist;
		if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
			while (nr_pages--)
				put_page_bootmem(page++);
		} else
			while (nr_pages--)
				free_reserved_page(page++);
	} else
		free_pages((unsigned long)page_address(page), order);
}
  704. static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
  705. {
  706. pte_t *pte;
  707. int i;
  708. for (i = 0; i < PTRS_PER_PTE; i++) {
  709. pte = pte_start + i;
  710. if (!pte_none(*pte))
  711. return;
  712. }
  713. /* free a pte talbe */
  714. free_pagetable(pmd_page(*pmd), 0);
  715. spin_lock(&init_mm.page_table_lock);
  716. pmd_clear(pmd);
  717. spin_unlock(&init_mm.page_table_lock);
  718. }
  719. static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
  720. {
  721. pmd_t *pmd;
  722. int i;
  723. for (i = 0; i < PTRS_PER_PMD; i++) {
  724. pmd = pmd_start + i;
  725. if (!pmd_none(*pmd))
  726. return;
  727. }
  728. /* free a pmd talbe */
  729. free_pagetable(pud_page(*pud), 0);
  730. spin_lock(&init_mm.page_table_lock);
  731. pud_clear(pud);
  732. spin_unlock(&init_mm.page_table_lock);
  733. }
  734. static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
  735. {
  736. pud_t *pud;
  737. int i;
  738. for (i = 0; i < PTRS_PER_PUD; i++) {
  739. pud = pud_start + i;
  740. if (!pud_none(*pud))
  741. return;
  742. }
  743. /* free a pud talbe */
  744. free_pagetable(p4d_page(*p4d), 0);
  745. spin_lock(&init_mm.page_table_lock);
  746. p4d_clear(p4d);
  747. spin_unlock(&init_mm.page_table_lock);
  748. }
/*
 * Tear down the PTE-level mappings covering [addr, end). @direct
 * distinguishes the kernel direct mapping (the mapped pages themselves
 * are not freed here) from vmemmap teardown (backing pages are freed
 * once every page struct in them is unused).
 */
static void __meminit
remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
		 bool direct)
{
	unsigned long next, pages = 0;
	pte_t *pte;
	void *page_addr;
	phys_addr_t phys_addr;

	pte = pte_start + pte_index(addr);
	for (; addr < end; addr = next, pte++) {
		next = (addr + PAGE_SIZE) & PAGE_MASK;
		if (next > end)
			next = end;

		if (!pte_present(*pte))
			continue;

		/*
		 * We mapped [0,1G) memory as identity mapping when
		 * initializing, in arch/x86/kernel/head_64.S. These
		 * pagetables cannot be removed.
		 *
		 * NOTE(review): pte_val() includes attribute bits and
		 * (addr & PAGE_MASK) is a virtual page base, so this sum
		 * is an odd way to form a physical address — confirm the
		 * intended computation.
		 */
		phys_addr = pte_val(*pte) + (addr & PAGE_MASK);
		if (phys_addr < (phys_addr_t)0x40000000)
			return;

		if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
			/*
			 * Do not free direct mapping pages since they were
			 * freed when offlining, or simply not in use.
			 */
			if (!direct)
				free_pagetable(pte_page(*pte), 0);

			spin_lock(&init_mm.page_table_lock);
			pte_clear(&init_mm, addr, pte);
			spin_unlock(&init_mm.page_table_lock);

			/* For non-direct mapping, pages means nothing. */
			pages++;
		} else {
			/*
			 * If we are here, we are freeing vmemmap pages since
			 * direct mapped memory ranges to be freed are aligned.
			 *
			 * If we are not removing the whole page, it means
			 * other page structs in this page are being used and
			 * we cannot remove them. So fill the unused
			 * page_structs with 0xFD, and remove the page when it
			 * is wholly filled with 0xFD.
			 */
			memset((void *)addr, PAGE_INUSE, next - addr);

			page_addr = page_address(pte_page(*pte));
			if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
				free_pagetable(pte_page(*pte), 0);

				spin_lock(&init_mm.page_table_lock);
				pte_clear(&init_mm, addr, pte);
				spin_unlock(&init_mm.page_table_lock);
			}
		}
	}

	/* Call free_pte_table() in remove_pmd_table(). */
	flush_tlb_all();
	if (direct)
		update_page_count(PG_LEVEL_4K, -pages);
}
/*
 * Tear down the PMD entries covering [addr, end).  Large (2M) leaf
 * mappings are cleared directly; otherwise the PTE table below is
 * walked via remove_pte_table() and freed once empty.  @direct has the
 * same meaning as in remove_pte_table().
 */
static void __meminit
remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
	bool direct)
{
	unsigned long next, pages = 0;
	pte_t *pte_base;
	pmd_t *pmd;
	void *page_addr;

	pmd = pmd_start + pmd_index(addr);
	for (; addr < end; addr = next, pmd++) {
		next = pmd_addr_end(addr, end);

		if (!pmd_present(*pmd))
			continue;

		if (pmd_large(*pmd)) {
			if (IS_ALIGNED(addr, PMD_SIZE) &&
			    IS_ALIGNED(next, PMD_SIZE)) {
				/* The whole 2M mapping goes away at once. */
				if (!direct)
					free_pagetable(pmd_page(*pmd),
						       get_order(PMD_SIZE));

				spin_lock(&init_mm.page_table_lock);
				pmd_clear(pmd);
				spin_unlock(&init_mm.page_table_lock);
				pages++;
			} else {
				/* If here, we are freeing vmemmap pages. */
				memset((void *)addr, PAGE_INUSE, next - addr);

				/* Free only once the 2M block is all 0xFD. */
				page_addr = page_address(pmd_page(*pmd));
				if (!memchr_inv(page_addr, PAGE_INUSE,
						PMD_SIZE)) {
					free_pagetable(pmd_page(*pmd),
						       get_order(PMD_SIZE));

					spin_lock(&init_mm.page_table_lock);
					pmd_clear(pmd);
					spin_unlock(&init_mm.page_table_lock);
				}
			}

			continue;
		}

		pte_base = (pte_t *)pmd_page_vaddr(*pmd);
		remove_pte_table(pte_base, addr, next, direct);
		free_pte_table(pte_base, pmd);
	}

	/* Call free_pmd_table() in remove_pud_table(). */
	if (direct)
		update_page_count(PG_LEVEL_2M, -pages);
}
/*
 * Tear down the PUD entries covering [addr, end).  Large (1G) leaf
 * mappings are cleared directly; otherwise recurse into
 * remove_pmd_table() and free the PMD table once it is empty.
 */
static void __meminit
remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
	bool direct)
{
	unsigned long next, pages = 0;
	pmd_t *pmd_base;
	pud_t *pud;
	void *page_addr;

	pud = pud_start + pud_index(addr);
	for (; addr < end; addr = next, pud++) {
		next = pud_addr_end(addr, end);

		if (!pud_present(*pud))
			continue;

		if (pud_large(*pud)) {
			if (IS_ALIGNED(addr, PUD_SIZE) &&
			    IS_ALIGNED(next, PUD_SIZE)) {
				/* The whole 1G mapping goes away at once. */
				if (!direct)
					free_pagetable(pud_page(*pud),
						       get_order(PUD_SIZE));

				spin_lock(&init_mm.page_table_lock);
				pud_clear(pud);
				spin_unlock(&init_mm.page_table_lock);
				pages++;
			} else {
				/* If here, we are freeing vmemmap pages. */
				memset((void *)addr, PAGE_INUSE, next - addr);

				/* Free only once the 1G block is all 0xFD. */
				page_addr = page_address(pud_page(*pud));
				if (!memchr_inv(page_addr, PAGE_INUSE,
						PUD_SIZE)) {
					free_pagetable(pud_page(*pud),
						       get_order(PUD_SIZE));

					spin_lock(&init_mm.page_table_lock);
					pud_clear(pud);
					spin_unlock(&init_mm.page_table_lock);
				}
			}

			continue;
		}

		pmd_base = pmd_offset(pud, 0);
		remove_pmd_table(pmd_base, addr, next, direct);
		free_pmd_table(pmd_base, pud);
	}

	if (direct)
		update_page_count(PG_LEVEL_1G, -pages);
}
/*
 * Tear down the P4D entries covering [addr, end) by recursing into the
 * PUD tables below them.
 */
static void __meminit
remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end,
	bool direct)
{
	/* NOTE(review): @pages is never incremented at this level. */
	unsigned long next, pages = 0;
	pud_t *pud_base;
	p4d_t *p4d;

	p4d = p4d_start + p4d_index(addr);
	for (; addr < end; addr = next, p4d++) {
		next = p4d_addr_end(addr, end);

		if (!p4d_present(*p4d))
			continue;

		/*
		 * No 512G leaf pages are expected; BUILD_BUG_ON requires
		 * p4d_large() to fold to a compile-time constant here.
		 */
		BUILD_BUG_ON(p4d_large(*p4d));

		pud_base = pud_offset(p4d, 0);
		remove_pud_table(pud_base, addr, next, direct);
		/*
		 * For 4-level page tables we do not want to free PUDs, but in the
		 * 5-level case we should free them. This code will have to change
		 * to adapt for boot-time switching between 4 and 5 level page tables.
		 */
		if (CONFIG_PGTABLE_LEVELS == 5)
			free_pud_table(pud_base, p4d);
	}

	if (direct)
		update_page_count(PG_LEVEL_512G, -pages);
}
  927. /* start and end are both virtual address. */
  928. static void __meminit
  929. remove_pagetable(unsigned long start, unsigned long end, bool direct)
  930. {
  931. unsigned long next;
  932. unsigned long addr;
  933. pgd_t *pgd;
  934. p4d_t *p4d;
  935. for (addr = start; addr < end; addr = next) {
  936. next = pgd_addr_end(addr, end);
  937. pgd = pgd_offset_k(addr);
  938. if (!pgd_present(*pgd))
  939. continue;
  940. p4d = p4d_offset(pgd, 0);
  941. remove_p4d_table(p4d, addr, next, direct);
  942. }
  943. flush_tlb_all();
  944. }
/* Unmap and free the vmemmap backing [start, end); both are virtual. */
void __ref vmemmap_free(unsigned long start, unsigned long end)
{
	remove_pagetable(start, end, false);
}
  949. #ifdef CONFIG_MEMORY_HOTREMOVE
  950. static void __meminit
  951. kernel_physical_mapping_remove(unsigned long start, unsigned long end)
  952. {
  953. start = (unsigned long)__va(start);
  954. end = (unsigned long)__va(end);
  955. remove_pagetable(start, end, true);
  956. }
/*
 * Arch hook for memory hot-remove: pull the pages out of their zone,
 * then tear down the kernel direct mapping for [start, start + size).
 */
int __ref arch_remove_memory(u64 start, u64 size)
{
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	struct page *page = pfn_to_page(start_pfn);
	struct vmem_altmap *altmap;
	struct zone *zone;
	int ret;

	/* With altmap the first mapped page is offset from @start */
	altmap = to_vmem_altmap((unsigned long) page);
	if (altmap)
		page += vmem_altmap_offset(altmap);
	zone = page_zone(page);

	ret = __remove_pages(zone, start_pfn, nr_pages);
	WARN_ON_ONCE(ret);

	/* Direct mapping is torn down even when __remove_pages() failed. */
	kernel_physical_mapping_remove(start, start + size);

	return ret;
}
  975. #endif
  976. #endif /* CONFIG_MEMORY_HOTPLUG */
/* /proc/kcore entry covering the vsyscall page; registered in mem_init(). */
static struct kcore_list kcore_vsyscall;
/* Register per-node bootmem info (NUMA only; no-op otherwise). */
static void __init register_page_bootmem_info(void)
{
#ifdef CONFIG_NUMA
	int i;

	for_each_online_node(i)
		register_page_bootmem_info_node(NODE_DATA(i));
#endif
}
/*
 * Late boot memory init: release all bootmem to the buddy allocator
 * and register memory areas for /proc/kcore.
 */
void __init mem_init(void)
{
	pci_iommu_alloc();

	/* clear_bss() already cleared the empty_zero_page */

	register_page_bootmem_info();

	/* this will put all memory onto the freelists */
	free_all_bootmem();
	after_bootmem = 1;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR,
		   PAGE_SIZE, KCORE_OTHER);

	mem_init_print_info(NULL);
}
/* Non-zero once mark_rodata_ro() has write-protected the kernel text. */
int kernel_set_to_readonly;
  1000. void set_kernel_text_rw(void)
  1001. {
  1002. unsigned long start = PFN_ALIGN(_text);
  1003. unsigned long end = PFN_ALIGN(__stop___ex_table);
  1004. if (!kernel_set_to_readonly)
  1005. return;
  1006. pr_debug("Set kernel text: %lx - %lx for read write\n",
  1007. start, end);
  1008. /*
  1009. * Make the kernel identity mapping for text RW. Kernel text
  1010. * mapping will always be RO. Refer to the comment in
  1011. * static_protections() in pageattr.c
  1012. */
  1013. set_memory_rw(start, (end - start) >> PAGE_SHIFT);
  1014. }
  1015. void set_kernel_text_ro(void)
  1016. {
  1017. unsigned long start = PFN_ALIGN(_text);
  1018. unsigned long end = PFN_ALIGN(__stop___ex_table);
  1019. if (!kernel_set_to_readonly)
  1020. return;
  1021. pr_debug("Set kernel text: %lx - %lx for read only\n",
  1022. start, end);
  1023. /*
  1024. * Set the kernel identity mapping for text RO.
  1025. */
  1026. set_memory_ro(start, (end - start) >> PAGE_SHIFT);
  1027. }
/*
 * Write-protect kernel text/rodata, mark the data/bss/brk region
 * non-executable, and free the now-unused section gaps.  Run once at
 * the end of kernel init.
 */
void mark_rodata_ro(void)
{
	unsigned long start = PFN_ALIGN(_text);
	unsigned long rodata_start = PFN_ALIGN(__start_rodata);
	unsigned long end = (unsigned long) &__end_rodata_hpage_align;
	unsigned long text_end = PFN_ALIGN(&__stop___ex_table);
	unsigned long rodata_end = PFN_ALIGN(&__end_rodata);
	unsigned long all_end;

	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
	       (end - start) >> 10);
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);

	kernel_set_to_readonly = 1;

	/*
	 * The rodata/data/bss/brk section (but not the kernel text!)
	 * should also be not-executable.
	 *
	 * We align all_end to PMD_SIZE because the existing mapping
	 * is a full PMD. If we would align _brk_end to PAGE_SIZE we
	 * split the PMD and the remainder between _brk_end and the end
	 * of the PMD will remain mapped executable.
	 *
	 * Any PMD which was setup after the one which covers _brk_end
	 * has been zapped already via cleanup_highmap().
	 */
	all_end = roundup((unsigned long)_brk_end, PMD_SIZE);
	set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT);

#ifdef CONFIG_CPA_DEBUG
	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
	set_memory_rw(start, (end-start) >> PAGE_SHIFT);

	printk(KERN_INFO "Testing CPA: again\n");
	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif

	free_init_pages("unused kernel",
			(unsigned long) __va(__pa_symbol(text_end)),
			(unsigned long) __va(__pa_symbol(rodata_start)));
	free_init_pages("unused kernel",
			(unsigned long) __va(__pa_symbol(rodata_end)),
			(unsigned long) __va(__pa_symbol(_sdata)));

	debug_checkwx();
}
/*
 * Return non-zero when @addr is a canonical kernel virtual address
 * mapped to a valid pfn.  Walks the page tables by hand, stopping
 * early at large (1G/2M) leaf mappings.
 */
int kern_addr_valid(unsigned long addr)
{
	/* Canonical addresses sign-extend: the high bits are all 0 or all 1. */
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	p4d = p4d_offset(pgd, addr);
	if (p4d_none(*p4d))
		return 0;

	pud = pud_offset(p4d, addr);
	if (pud_none(*pud))
		return 0;

	if (pud_large(*pud))
		return pfn_valid(pud_pfn(*pud));

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;

	return pfn_valid(pte_pfn(*pte));
}
  1099. static unsigned long probe_memory_block_size(void)
  1100. {
  1101. unsigned long bz = MIN_MEMORY_BLOCK_SIZE;
  1102. /* if system is UV or has 64GB of RAM or more, use large blocks */
  1103. if (is_uv_system() || ((max_pfn << PAGE_SHIFT) >= (64UL << 30)))
  1104. bz = 2UL << 30; /* 2GB */
  1105. pr_info("x86/mm: Memory block size: %ldMB\n", bz >> 20);
  1106. return bz;
  1107. }
/* Cached result of probe_memory_block_size(); probed on first use. */
static unsigned long memory_block_size_probed;

/* Return the sysfs memory block size in bytes, probing it once. */
unsigned long memory_block_size_bytes(void)
{
	if (!memory_block_size_probed)
		memory_block_size_probed = probe_memory_block_size();

	return memory_block_size_probed;
}
  1115. #ifdef CONFIG_SPARSEMEM_VMEMMAP
  1116. /*
  1117. * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
  1118. */
/* Bookkeeping for coalescing contiguous PMD mappings in the debug print. */
static long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;
/*
 * Populate the vmemmap for [start, end) on @node using 2M (PMD)
 * mappings where possible.  With an @altmap the PMD-sized blocks come
 * from the altmap reservation and there is no base-page fallback.
 * Returns 0 on success, -ENOMEM on allocation failure.
 */
static int __meminit vmemmap_populate_hugepages(unsigned long start,
		unsigned long end, int node, struct vmem_altmap *altmap)
{
	unsigned long addr;
	unsigned long next;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	for (addr = start; addr < end; addr = next) {
		next = pmd_addr_end(addr, end);

		/* Ensure the PGD..PUD levels exist for this address. */
		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;

		p4d = vmemmap_p4d_populate(pgd, addr, node);
		if (!p4d)
			return -ENOMEM;

		pud = vmemmap_pud_populate(p4d, addr, node);
		if (!pud)
			return -ENOMEM;

		pmd = pmd_offset(pud, addr);
		if (pmd_none(*pmd)) {
			void *p;

			p = __vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
			if (p) {
				pte_t entry;

				entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
						PAGE_KERNEL_LARGE);
				set_pmd(pmd, __pmd(pte_val(entry)));

				/* check to see if we have contiguous blocks */
				if (p_end != p || node_start != node) {
					if (p_start)
						pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
						       addr_start, addr_end-1, p_start, p_end-1, node_start);
					addr_start = addr;
					node_start = node;
					p_start = p;
				}

				addr_end = addr + PMD_SIZE;
				p_end = p + PMD_SIZE;
				continue;
			} else if (altmap)
				return -ENOMEM; /* no fallback */
		} else if (pmd_large(*pmd)) {
			/* Already mapped with a huge page; just verify it. */
			vmemmap_verify((pte_t *)pmd, node, addr, next);
			continue;
		}
		pr_warn_once("vmemmap: falling back to regular page backing\n");
		if (vmemmap_populate_basepages(addr, next, node))
			return -ENOMEM;
	}
	return 0;
}
  1175. int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
  1176. {
  1177. struct vmem_altmap *altmap = to_vmem_altmap(start);
  1178. int err;
  1179. if (boot_cpu_has(X86_FEATURE_PSE))
  1180. err = vmemmap_populate_hugepages(start, end, node, altmap);
  1181. else if (altmap) {
  1182. pr_err_once("%s: no cpu support for altmap allocations\n",
  1183. __func__);
  1184. err = -ENOMEM;
  1185. } else
  1186. err = vmemmap_populate_basepages(start, end, node);
  1187. if (!err)
  1188. sync_global_pgds(start, end - 1);
  1189. return err;
  1190. }
  1191. #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE)
/*
 * Mark the page-table pages backing the memmap of @section_nr as
 * bootmem.  Table pages at PGD..PMD level get MIX_SECTION_INFO (they
 * may serve several sections); the leaf pages holding the struct pages
 * themselves get SECTION_INFO.
 */
void register_page_bootmem_memmap(unsigned long section_nr,
		struct page *start_page, unsigned long size)
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + size);
	unsigned long next;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	unsigned int nr_pages;
	struct page *page;

	for (; addr < end; addr = next) {
		pte_t *pte = NULL;

		pgd = pgd_offset_k(addr);
		if (pgd_none(*pgd)) {
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			continue;
		}
		get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO);

		p4d = p4d_offset(pgd, addr);
		if (p4d_none(*p4d)) {
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			continue;
		}
		get_page_bootmem(section_nr, p4d_page(*p4d), MIX_SECTION_INFO);

		pud = pud_offset(p4d, addr);
		if (pud_none(*pud)) {
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			continue;
		}
		get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);

		if (!boot_cpu_has(X86_FEATURE_PSE)) {
			/* 4K vmemmap: walk PMD and PTE one page at a time. */
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			pmd = pmd_offset(pud, addr);
			if (pmd_none(*pmd))
				continue;
			get_page_bootmem(section_nr, pmd_page(*pmd),
					 MIX_SECTION_INFO);

			pte = pte_offset_kernel(pmd, addr);
			if (pte_none(*pte))
				continue;
			get_page_bootmem(section_nr, pte_page(*pte),
					 SECTION_INFO);
		} else {
			/* 2M vmemmap: mark every page of the PMD leaf. */
			next = pmd_addr_end(addr, end);

			pmd = pmd_offset(pud, addr);
			if (pmd_none(*pmd))
				continue;

			nr_pages = 1 << (get_order(PMD_SIZE));
			page = pmd_page(*pmd);
			while (nr_pages--)
				get_page_bootmem(section_nr, page++,
						 SECTION_INFO);
		}
	}
}
  1249. #endif
  1250. void __meminit vmemmap_populate_print_last(void)
  1251. {
  1252. if (p_start) {
  1253. pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
  1254. addr_start, addr_end-1, p_start, p_end-1, node_start);
  1255. p_start = NULL;
  1256. p_end = NULL;
  1257. node_start = 0;
  1258. }
  1259. }
  1260. #endif