pgtable-radix.c
/*
 * Page table handling routines for radix page table.
 *
 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#define pr_fmt(fmt) "radix-mmu: " fmt

#include <linux/kernel.h>
#include <linux/sched/mm.h>
#include <linux/memblock.h>
#include <linux/of_fdt.h>
#include <linux/mm.h>
#include <linux/string_helpers.h>
#include <linux/stop_machine.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/mmu_context.h>
#include <asm/dma.h>
#include <asm/machdep.h>
#include <asm/mmu.h>
#include <asm/firmware.h>
#include <asm/powernv.h>
#include <asm/sections.h>
#include <asm/trace.h>

#include <trace/events/thp.h>
unsigned int mmu_pid_bits;
unsigned int mmu_base_pid;
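
/*
 * On bare-metal (HV) boot, point partition table entry 0 at the host
 * process table: keep the existing first doubleword and set the second
 * to the process table base, its size encoding and PATB_GR to mark the
 * process table as radix format.
 */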
static int native_register_process_table(unsigned long base, unsigned long pg_sz,
					  unsigned long table_size)
{
	unsigned long patb0, patb1;

	patb0 = be64_to_cpu(partition_tb[0].patb0);
	patb1 = base | table_size | PATB_GR;
	mmu_partition_table_set_entry(0, patb0, patb1);

	return 0;
}
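
/*
 * Early page table allocation: slab is not up yet, so take the memory
 * straight from memblock, honouring an optional node or physical-range
 * hint, and fall back to any available memory if the hinted allocation
 * fails.
 */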
static __ref void *early_alloc_pgtable(unsigned long size, int nid,
			unsigned long region_start, unsigned long region_end)
{
	unsigned long pa = 0;
	void *pt;

	if (region_start || region_end) /* has region hint */
		pa = memblock_alloc_range(size, size, region_start, region_end,
						MEMBLOCK_NONE);
	else if (nid != -1) /* has node hint */
		pa = memblock_alloc_base_nid(size, size,
						MEMBLOCK_ALLOC_ANYWHERE,
						nid, MEMBLOCK_NONE);

	if (!pa)
		pa = memblock_alloc_base(size, size, MEMBLOCK_ALLOC_ANYWHERE);

	BUG_ON(!pa);

	pt = __va(pa);
	memset(pt, 0, size);

	return pt;
}
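
/*
 * Map a single kernel page before slab is available, walking the kernel
 * page tables by hand and allocating any missing levels from memblock.
 * map_page_size selects whether the PTE is installed at PUD, PMD or PTE
 * level.
 */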
static int early_map_kernel_page(unsigned long ea, unsigned long pa,
			  pgprot_t flags,
			  unsigned int map_page_size,
			  int nid,
			  unsigned long region_start, unsigned long region_end)
{
	unsigned long pfn = pa >> PAGE_SHIFT;
	pgd_t *pgdp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	pgdp = pgd_offset_k(ea);
	if (pgd_none(*pgdp)) {
		pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
						region_start, region_end);
		pgd_populate(&init_mm, pgdp, pudp);
	}
	pudp = pud_offset(pgdp, ea);
	if (map_page_size == PUD_SIZE) {
		ptep = (pte_t *)pudp;
		goto set_the_pte;
	}
	if (pud_none(*pudp)) {
		pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid,
						region_start, region_end);
		pud_populate(&init_mm, pudp, pmdp);
	}
	pmdp = pmd_offset(pudp, ea);
	if (map_page_size == PMD_SIZE) {
		ptep = pmdp_ptep(pmdp);
		goto set_the_pte;
	}
	if (!pmd_present(*pmdp)) {
		ptep = early_alloc_pgtable(PAGE_SIZE, nid,
						region_start, region_end);
		pmd_populate_kernel(&init_mm, pmdp, ptep);
	}
	ptep = pte_offset_kernel(pmdp, ea);

set_the_pte:
	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
	smp_wmb();
	return 0;
}
/*
 * nid, region_start, and region_end are hints to try to place the page
 * table memory in the same node or region.
 */
static int __map_kernel_page(unsigned long ea, unsigned long pa,
			  pgprot_t flags,
			  unsigned int map_page_size,
			  int nid,
			  unsigned long region_start, unsigned long region_end)
{
	unsigned long pfn = pa >> PAGE_SHIFT;
	pgd_t *pgdp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;
	/*
	 * Make sure task size is correct as per the max addr
	 */
	BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);

	if (unlikely(!slab_is_available()))
		return early_map_kernel_page(ea, pa, flags, map_page_size,
						nid, region_start, region_end);

	/*
	 * Should make page table allocation functions be able to take a
	 * node, so we can place kernel page tables on the right nodes after
	 * boot.
	 */
	pgdp = pgd_offset_k(ea);
	pudp = pud_alloc(&init_mm, pgdp, ea);
	if (!pudp)
		return -ENOMEM;
	if (map_page_size == PUD_SIZE) {
		ptep = (pte_t *)pudp;
		goto set_the_pte;
	}
	pmdp = pmd_alloc(&init_mm, pudp, ea);
	if (!pmdp)
		return -ENOMEM;
	if (map_page_size == PMD_SIZE) {
		ptep = pmdp_ptep(pmdp);
		goto set_the_pte;
	}
	ptep = pte_alloc_kernel(pmdp, ea);
	if (!ptep)
		return -ENOMEM;

set_the_pte:
	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
	smp_wmb();
	return 0;
}
int radix__map_kernel_page(unsigned long ea, unsigned long pa,
			  pgprot_t flags,
			  unsigned int map_page_size)
{
	return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
}

#ifdef CONFIG_STRICT_KERNEL_RWX
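/*
 * Clear the given protection bits from every PTE in [start, end),
 * walking down through huge PUD/PMD entries where the range is mapped
 * with large pages, then flush the kernel TLB for the range.
 */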
void radix__change_memory_range(unsigned long start, unsigned long end,
				unsigned long clear)
{
	unsigned long idx;
	pgd_t *pgdp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	start = ALIGN_DOWN(start, PAGE_SIZE);
	end = PAGE_ALIGN(end); // aligns up

	pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
		 start, end, clear);

	for (idx = start; idx < end; idx += PAGE_SIZE) {
		pgdp = pgd_offset_k(idx);
		pudp = pud_alloc(&init_mm, pgdp, idx);
		if (!pudp)
			continue;
		if (pud_huge(*pudp)) {
			ptep = (pte_t *)pudp;
			goto update_the_pte;
		}
		pmdp = pmd_alloc(&init_mm, pudp, idx);
		if (!pmdp)
			continue;
		if (pmd_huge(*pmdp)) {
			ptep = pmdp_ptep(pmdp);
			goto update_the_pte;
		}
		ptep = pte_alloc_kernel(pmdp, idx);
		if (!ptep)
			continue;
update_the_pte:
		radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
	}

	radix__flush_tlb_kernel_range(start, end);
}
void radix__mark_rodata_ro(void)
{
	unsigned long start, end;

	/*
	 * mark_rodata_ro() will mark itself as !writable at some point.
	 * Due to DD1 workaround in radix__pte_update(), we'll end up with
	 * an invalid pte and the system will crash quite severely.
	 */
	if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
		pr_warn("Warning: Unable to mark rodata read only on P9 DD1\n");
		return;
	}

	start = (unsigned long)_stext;
	end = (unsigned long)__init_begin;

	radix__change_memory_range(start, end, _PAGE_WRITE);
}

void radix__mark_initmem_nx(void)
{
	unsigned long start = (unsigned long)__init_begin;
	unsigned long end = (unsigned long)__init_end;

	radix__change_memory_range(start, end, _PAGE_EXEC);
}
#endif /* CONFIG_STRICT_KERNEL_RWX */
static inline void __meminit print_mapping(unsigned long start,
					   unsigned long end,
					   unsigned long size)
{
	char buf[10];

	if (end <= start)
		return;

	string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));

	pr_info("Mapped 0x%016lx-0x%016lx with %s pages\n", start, end, buf);
}
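
/*
 * Map the physical range [start, end) into the kernel linear mapping,
 * preferring 1G then 2M pages when alignment and the supported page
 * sizes allow it. With STRICT_KERNEL_RWX the mapping is split down to
 * smaller pages around kernel text so text can later get its own
 * protection, and text/interrupt-vector ranges are mapped executable.
 */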
static int __meminit create_physical_mapping(unsigned long start,
					     unsigned long end,
					     int nid)
{
	unsigned long vaddr, addr, mapping_size = 0;
	pgprot_t prot;
	unsigned long max_mapping_size;
#ifdef CONFIG_STRICT_KERNEL_RWX
	int split_text_mapping = 1;
#else
	int split_text_mapping = 0;
#endif

	start = _ALIGN_UP(start, PAGE_SIZE);
	for (addr = start; addr < end; addr += mapping_size) {
		unsigned long gap, previous_size;
		int rc;

		gap = end - addr;
		previous_size = mapping_size;
		max_mapping_size = PUD_SIZE;

retry:
		if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
		    mmu_psize_defs[MMU_PAGE_1G].shift &&
		    PUD_SIZE <= max_mapping_size)
			mapping_size = PUD_SIZE;
		else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
			 mmu_psize_defs[MMU_PAGE_2M].shift)
			mapping_size = PMD_SIZE;
		else
			mapping_size = PAGE_SIZE;

		if (split_text_mapping && (mapping_size == PUD_SIZE) &&
			(addr <= __pa_symbol(__init_begin)) &&
			(addr + mapping_size) >= __pa_symbol(_stext)) {
			max_mapping_size = PMD_SIZE;
			goto retry;
		}

		if (split_text_mapping && (mapping_size == PMD_SIZE) &&
		    (addr <= __pa_symbol(__init_begin)) &&
		    (addr + mapping_size) >= __pa_symbol(_stext))
			mapping_size = PAGE_SIZE;

		if (mapping_size != previous_size) {
			print_mapping(start, addr, previous_size);
			start = addr;
		}

		vaddr = (unsigned long)__va(addr);

		if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
		    overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size))
			prot = PAGE_KERNEL_X;
		else
			prot = PAGE_KERNEL;

		rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
		if (rc)
			return rc;
	}

	print_mapping(start, addr, mapping_size);
	return 0;
}
void __init radix_init_pgtable(void)
{
	unsigned long rts_field;
	struct memblock_region *reg;

	/* We don't support slb for radix */
	mmu_slb_size = 0;
	/*
	 * Create the linear mapping, using standard page size for now
	 */
	for_each_memblock(memory, reg) {
		/*
		 * The memblock allocator is up at this point, so the
		 * page tables will be allocated within the range. No
		 * need for a node (which we don't have yet).
		 */
		WARN_ON(create_physical_mapping(reg->base,
						reg->base + reg->size,
						-1));
	}

	/* Find out how many PID bits are supported */
	if (cpu_has_feature(CPU_FTR_HVMODE)) {
		if (!mmu_pid_bits)
			mmu_pid_bits = 20;
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
		/*
		 * When KVM is possible, we only use the top half of the
		 * PID space to avoid collisions between host and guest PIDs
		 * which can cause problems due to prefetch when exiting the
		 * guest with AIL=3
		 */
		mmu_base_pid = 1 << (mmu_pid_bits - 1);
#else
		mmu_base_pid = 1;
#endif
	} else {
		/* The guest uses the bottom half of the PID space */
		if (!mmu_pid_bits)
			mmu_pid_bits = 19;
		mmu_base_pid = 1;
	}

	/*
	 * Allocate Partition table and process table for the
	 * host.
	 */
	BUG_ON(PRTB_SIZE_SHIFT > 36);
	process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0);
	/*
	 * Fill in the process table.
	 */
	rts_field = radix__get_tree_size();
	process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);
	/*
	 * Fill in the partition table. We are supposed to use the effective
	 * address of the process table here, but our linear mapping also
	 * enables us to use the physical address.
	 */
	register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12);
	pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd);
	asm volatile("ptesync" : : : "memory");
	asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
		     "r" (TLBIEL_INVAL_SET_LPID), "r" (0));
	asm volatile("eieio; tlbsync; ptesync" : : : "memory");
	trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1);

	/*
	 * The init_mm context is given the first available (non-zero) PID,
	 * which is the "guard PID" and contains no page table. PIDR should
	 * never be set to zero because that duplicates the kernel address
	 * space at the 0x0... offset (quadrant 0)!
	 *
	 * An arbitrary PID that may later be allocated by the PID allocator
	 * for userspace processes must not be used either, because that
	 * would cause stale user mappings for that PID on CPUs outside of
	 * the TLB invalidation scheme (because it won't be in mm_cpumask).
	 *
	 * So permanently carve out one PID for the purpose of a guard PID.
	 */
	init_mm.context.id = mmu_base_pid;
	mmu_base_pid++;
}
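
/*
 * Set up partition table entry 0 for the host: radix tree size, the
 * physical address of the kernel PGD, its index size and the Host-Radix
 * (PATB_HR) bit.
 */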
static void __init radix_init_partition_table(void)
{
	unsigned long rts_field, dw0;

	mmu_partition_table_init();
	rts_field = radix__get_tree_size();
	dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
	mmu_partition_table_set_entry(0, dw0, 0);

	pr_info("Initializing Radix MMU\n");
	pr_info("Partition table %p\n", partition_tb);
}

void __init radix_init_native(void)
{
	register_process_table = native_register_process_table;
}
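
/* Translate a page-size shift from the device tree into an MMU_PAGE_* index. */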
static int __init get_idx_from_shift(unsigned int shift)
{
	int idx = -1;

	switch (shift) {
	case 0xc:
		idx = MMU_PAGE_4K;
		break;
	case 0x10:
		idx = MMU_PAGE_64K;
		break;
	case 0x15:
		idx = MMU_PAGE_2M;
		break;
	case 0x1e:
		idx = MMU_PAGE_1G;
		break;
	}
	return idx;
}
static int __init radix_dt_scan_page_sizes(unsigned long node,
					   const char *uname, int depth,
					   void *data)
{
	int size = 0;
	int shift, idx;
	unsigned int ap;
	const __be32 *prop;
	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);

	/* We are scanning "cpu" nodes only */
	if (type == NULL || strcmp(type, "cpu") != 0)
		return 0;

	/* Find MMU PID size */
	prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
	if (prop && size == 4)
		mmu_pid_bits = be32_to_cpup(prop);

	/* Grab page size encodings */
	prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
	if (!prop)
		return 0;

	pr_info("Page sizes from device-tree:\n");
	for (; size >= 4; size -= 4, ++prop) {
		struct mmu_psize_def *def;

		/* top 3 bits are the AP encoding */
		shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
		ap = be32_to_cpu(prop[0]) >> 29;
		pr_info("Page size shift = %d AP=0x%x\n", shift, ap);

		idx = get_idx_from_shift(shift);
		if (idx < 0)
			continue;

		def = &mmu_psize_defs[idx];
		def->shift = shift;
		def->ap = ap;
	}

	/* needed ? */
	cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
	return 1;
}
void __init radix__early_init_devtree(void)
{
	int rc;

	/*
	 * Try to find the available page sizes in the device-tree
	 */
	rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
	if (rc != 0)  /* Found */
		goto found;
	/*
	 * let's assume we have page 4k and 64k support
	 */
	mmu_psize_defs[MMU_PAGE_4K].shift = 12;
	mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;

	mmu_psize_defs[MMU_PAGE_64K].shift = 16;
	mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
found:
#ifdef CONFIG_SPARSEMEM_VMEMMAP
	if (mmu_psize_defs[MMU_PAGE_2M].shift) {
		/*
		 * map vmemmap using 2M if available
		 */
		mmu_vmemmap_psize = MMU_PAGE_2M;
	}
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
	return;
}
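
/*
 * POWER9 DD1 workaround: invalidate all TLB entries (both process- and
 * partition-scoped), then flip HID0 into radix mode and wait for the
 * change to take effect.
 */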
static void update_hid_for_radix(void)
{
	unsigned long hid0;
	unsigned long rb = 3UL << PPC_BITLSHIFT(53); /* IS = 3 */

	asm volatile("ptesync": : :"memory");
	/* prs = 0, ric = 2, rs = 0, r = 1 is = 3 */
	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
		     : : "r"(rb), "i"(1), "i"(0), "i"(2), "r"(0) : "memory");
	/* prs = 1, ric = 2, rs = 0, r = 1 is = 3 */
	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
		     : : "r"(rb), "i"(1), "i"(1), "i"(2), "r"(0) : "memory");
	asm volatile("eieio; tlbsync; ptesync; isync; slbia": : :"memory");
	trace_tlbie(0, 0, rb, 0, 2, 0, 1);
	trace_tlbie(0, 0, rb, 0, 2, 1, 1);
	/*
	 * now switch the HID
	 */
	hid0 = mfspr(SPRN_HID0);
	hid0 |= HID0_POWER9_RADIX;
	mtspr(SPRN_HID0, hid0);
	asm volatile("isync": : :"memory");

	/* Wait for it to happen */
	while (!(mfspr(SPRN_HID0) & HID0_POWER9_RADIX))
		cpu_relax();
}
static void radix_init_amor(void)
{
	/*
	 * In HV mode, we init AMOR (Authority Mask Override Register) so that
	 * the hypervisor and guest can setup IAMR (Instruction Authority Mask
	 * Register), enable key 0 and set it to 1.
	 *
	 * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11)
	 */
	mtspr(SPRN_AMOR, (3ul << 62));
}

static void radix_init_iamr(void)
{
	unsigned long iamr;

	/*
	 * The IAMR should be set to 0 on DD1.
	 */
	if (cpu_has_feature(CPU_FTR_POWER9_DD1))
		iamr = 0;
	else
		iamr = (1ul << 62);

	/*
	 * Radix always uses key0 of the IAMR to determine if an access is
	 * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
	 * fetch.
	 */
	mtspr(SPRN_IAMR, iamr);
}
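
/*
 * Boot-CPU MMU setup for radix: choose the Linux and vmemmap page sizes,
 * install the radix geometry into the generic page-table accessors, set
 * up the partition/process tables (natively or via the hypervisor), and
 * switch to the guard PID before the MMU is turned on.
 */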
void __init radix__early_init_mmu(void)
{
	unsigned long lpcr;

#ifdef CONFIG_PPC_64K_PAGES
	/* PAGE_SIZE mappings */
	mmu_virtual_psize = MMU_PAGE_64K;
#else
	mmu_virtual_psize = MMU_PAGE_4K;
#endif

#ifdef CONFIG_SPARSEMEM_VMEMMAP
	/* vmemmap mapping */
	mmu_vmemmap_psize = mmu_virtual_psize;
#endif
	/*
	 * initialize page table size
	 */
	__pte_index_size = RADIX_PTE_INDEX_SIZE;
	__pmd_index_size = RADIX_PMD_INDEX_SIZE;
	__pud_index_size = RADIX_PUD_INDEX_SIZE;
	__pgd_index_size = RADIX_PGD_INDEX_SIZE;
	__pud_cache_index = RADIX_PUD_INDEX_SIZE;
	__pte_table_size = RADIX_PTE_TABLE_SIZE;
	__pmd_table_size = RADIX_PMD_TABLE_SIZE;
	__pud_table_size = RADIX_PUD_TABLE_SIZE;
	__pgd_table_size = RADIX_PGD_TABLE_SIZE;

	__pmd_val_bits = RADIX_PMD_VAL_BITS;
	__pud_val_bits = RADIX_PUD_VAL_BITS;
	__pgd_val_bits = RADIX_PGD_VAL_BITS;

	__kernel_virt_start = RADIX_KERN_VIRT_START;
	__kernel_virt_size = RADIX_KERN_VIRT_SIZE;
	__vmalloc_start = RADIX_VMALLOC_START;
	__vmalloc_end = RADIX_VMALLOC_END;
	__kernel_io_start = RADIX_KERN_IO_START;
	vmemmap = (struct page *)RADIX_VMEMMAP_BASE;
	ioremap_bot = IOREMAP_BASE;

#ifdef CONFIG_PCI
	pci_io_base = ISA_IO_BASE;
#endif
	__pte_frag_nr = RADIX_PTE_FRAG_NR;
	__pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT;
	__pmd_frag_nr = RADIX_PMD_FRAG_NR;
	__pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;

	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		radix_init_native();
		if (cpu_has_feature(CPU_FTR_POWER9_DD1))
			update_hid_for_radix();
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
		radix_init_partition_table();
		radix_init_amor();
	} else {
		radix_init_pseries();
	}

	memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);

	radix_init_iamr();
	radix_init_pgtable();
	/* Switch to the guard PID before turning on MMU */
	radix__switch_mmu_context(NULL, &init_mm);
	if (cpu_has_feature(CPU_FTR_HVMODE))
		tlbiel_all();
}
void radix__early_init_mmu_secondary(void)
{
	unsigned long lpcr;
	/*
	 * update partition table control register and UPRT
	 */
	if (!firmware_has_feature(FW_FEATURE_LPAR)) {

		if (cpu_has_feature(CPU_FTR_POWER9_DD1))
			update_hid_for_radix();

		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);

		mtspr(SPRN_PTCR,
		      __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
		radix_init_amor();
	}
	radix_init_iamr();

	radix__switch_mmu_context(NULL, &init_mm);
	if (cpu_has_feature(CPU_FTR_HVMODE))
		tlbiel_all();
}

void radix__mmu_cleanup_all(void)
{
	unsigned long lpcr;

	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
		mtspr(SPRN_PTCR, 0);
		powernv_set_nmmu_ptcr(0);
		radix__flush_tlb_all();
	}
}
void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
				phys_addr_t first_memblock_size)
{
	/* We don't currently support the first MEMBLOCK not mapping 0
	 * physical on those processors
	 */
	BUG_ON(first_memblock_base != 0);

	/*
	 * Radix mode is not limited by RMA / VRMA addressing.
	 */
	ppc64_rma_size = ULONG_MAX;
}

#ifdef CONFIG_MEMORY_HOTPLUG
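/*
 * Free a page-table page and clear the entry above it, but only when
 * every slot in the table is empty.
 */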
static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
{
	pte_t *pte;
	int i;

	for (i = 0; i < PTRS_PER_PTE; i++) {
		pte = pte_start + i;
		if (!pte_none(*pte))
			return;
	}

	pte_free_kernel(&init_mm, pte_start);
	pmd_clear(pmd);
}

static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
{
	pmd_t *pmd;
	int i;

	for (i = 0; i < PTRS_PER_PMD; i++) {
		pmd = pmd_start + i;
		if (!pmd_none(*pmd))
			return;
	}

	pmd_free(&init_mm, pmd_start);
	pud_clear(pud);
}
struct change_mapping_params {
	pte_t *pte;
	unsigned long start;
	unsigned long end;
	unsigned long aligned_start;
	unsigned long aligned_end;
};
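
/*
 * Run under stop_machine(): clear the huge PTE being split and remap the
 * parts of the aligned region that fall outside [start, end) with smaller
 * pages. The caller holds init_mm.page_table_lock, so drop it across the
 * mapping calls and retake it before returning.
 */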
static int __meminit stop_machine_change_mapping(void *data)
{
	struct change_mapping_params *params =
			(struct change_mapping_params *)data;

	if (!data)
		return -1;

	spin_unlock(&init_mm.page_table_lock);
	pte_clear(&init_mm, params->aligned_start, params->pte);
	create_physical_mapping(params->aligned_start, params->start, -1);
	create_physical_mapping(params->end, params->aligned_end, -1);
	spin_lock(&init_mm.page_table_lock);
	return 0;
}
static void remove_pte_table(pte_t *pte_start, unsigned long addr,
			     unsigned long end)
{
	unsigned long next;
	pte_t *pte;

	pte = pte_start + pte_index(addr);
	for (; addr < end; addr = next, pte++) {
		next = (addr + PAGE_SIZE) & PAGE_MASK;
		if (next > end)
			next = end;

		if (!pte_present(*pte))
			continue;

		if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
			/*
			 * The vmemmap_free() and remove_section_mapping()
			 * codepaths call us with aligned addresses.
			 */
			WARN_ONCE(1, "%s: unaligned range\n", __func__);
			continue;
		}

		pte_clear(&init_mm, addr, pte);
	}
}
/*
 * Helper to clear the pte and potentially split the mapping.
 */
static void __meminit split_kernel_mapping(unsigned long addr, unsigned long end,
				unsigned long size, pte_t *pte)
{
	unsigned long mask = ~(size - 1);
	unsigned long aligned_start = addr & mask;
	unsigned long aligned_end = addr + size;
	struct change_mapping_params params;
	bool split_region = false;

	if ((end - addr) < size) {
		/*
		 * We're going to clear the PTE without having flushed the
		 * mapping yet, so we have to remap and flush. If the effects
		 * are visible outside the processor, or if we are running
		 * in code close to the mapping we cleared, we are in
		 * trouble.
		 */
		if (overlaps_kernel_text(aligned_start, addr) ||
			overlaps_kernel_text(end, aligned_end)) {
			/*
			 * Hack, just return, don't pte_clear
			 */
			WARN_ONCE(1, "Linear mapping %lx->%lx overlaps kernel "
				  "text, not splitting\n", addr, end);
			return;
		}
		split_region = true;
	}

	if (split_region) {
		params.pte = pte;
		params.start = addr;
		params.end = end;
		params.aligned_start = addr & ~(size - 1);
		params.aligned_end = min_t(unsigned long, aligned_end,
				(unsigned long)__va(memblock_end_of_DRAM()));
		stop_machine(stop_machine_change_mapping, &params, NULL);
		return;
	}

	pte_clear(&init_mm, addr, pte);
}
static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
			     unsigned long end)
{
	unsigned long next;
	pte_t *pte_base;
	pmd_t *pmd;

	pmd = pmd_start + pmd_index(addr);
	for (; addr < end; addr = next, pmd++) {
		next = pmd_addr_end(addr, end);

		if (!pmd_present(*pmd))
			continue;

		if (pmd_huge(*pmd)) {
			split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd);
			continue;
		}

		pte_base = (pte_t *)pmd_page_vaddr(*pmd);
		remove_pte_table(pte_base, addr, next);
		free_pte_table(pte_base, pmd);
	}
}

static void remove_pud_table(pud_t *pud_start, unsigned long addr,
			     unsigned long end)
{
	unsigned long next;
	pmd_t *pmd_base;
	pud_t *pud;

	pud = pud_start + pud_index(addr);
	for (; addr < end; addr = next, pud++) {
		next = pud_addr_end(addr, end);

		if (!pud_present(*pud))
			continue;

		if (pud_huge(*pud)) {
			split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud);
			continue;
		}

		pmd_base = (pmd_t *)pud_page_vaddr(*pud);
		remove_pmd_table(pmd_base, addr, next);
		free_pmd_table(pmd_base, pud);
	}
}
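
/*
 * Unmap and free the kernel page tables covering [start, end), splitting
 * any huge mappings that only partially overlap the range, then flush
 * the kernel TLB for the range.
 */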
static void __meminit remove_pagetable(unsigned long start, unsigned long end)
{
	unsigned long addr, next;
	pud_t *pud_base;
	pgd_t *pgd;

	spin_lock(&init_mm.page_table_lock);

	for (addr = start; addr < end; addr = next) {
		next = pgd_addr_end(addr, end);

		pgd = pgd_offset_k(addr);
		if (!pgd_present(*pgd))
			continue;

		if (pgd_huge(*pgd)) {
			split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd);
			continue;
		}

		pud_base = (pud_t *)pgd_page_vaddr(*pgd);
		remove_pud_table(pud_base, addr, next);
	}

	spin_unlock(&init_mm.page_table_lock);
	radix__flush_tlb_kernel_range(start, end);
}

int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid)
{
	return create_physical_mapping(start, end, nid);
}

int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
{
	remove_pagetable(start, end);
	return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG */

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
				 pgprot_t flags, unsigned int map_page_size,
				 int nid)
{
	return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
}

int __meminit radix__vmemmap_create_mapping(unsigned long start,
				      unsigned long page_size,
				      unsigned long phys)
{
	/* Create a PTE encoding */
	unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW;
	int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
	int ret;

	ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid);
	BUG_ON(ret);

	return 0;
}

#ifdef CONFIG_MEMORY_HOTPLUG
void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
{
	remove_pagetable(start, start + page_size);
}
#endif
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
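/*
 * Atomically clear/set bits in a huge-page PMD and trace the update.
 * The caller must hold the PMD lock.
 */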
unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
				  pmd_t *pmdp, unsigned long clr,
				  unsigned long set)
{
	unsigned long old;

#ifdef CONFIG_DEBUG_VM
	WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
	assert_spin_locked(pmd_lockptr(mm, pmdp));
#endif

	old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1);
	trace_hugepage_update(addr, old, clr, set);

	return old;
}

pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
			pmd_t *pmdp)
{
	pmd_t pmd;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
	VM_BUG_ON(pmd_devmap(*pmdp));
	/*
	 * khugepaged calls this for normal pmd
	 */
	pmd = *pmdp;
	pmd_clear(pmdp);

	/*FIXME!! Verify whether we need this kick below */
	serialize_against_pte_lookup(vma->vm_mm);

	radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);

	return pmd;
}
/*
 * For us pgtable_t is pte_t *. In order to save the deposited
 * page table, we consider the allocated page table as a list
 * head. On withdraw we need to make sure we zero out the used
 * list_head memory area.
 */
void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				 pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	pte_t *ptep;
	pgtable_t pgtable;
	struct list_head *lh;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	*ptep = __pte(0);
	ptep++;
	*ptep = __pte(0);

	return pgtable;
}
pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
				     unsigned long addr, pmd_t *pmdp)
{
	pmd_t old_pmd;
	unsigned long old;

	old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
	old_pmd = __pmd(old);
	/*
	 * Serialize against find_current_mm_pte which does lock-less
	 * lookup in page tables with local interrupts disabled. For huge pages
	 * it casts pmd_t to pte_t. Since format of pte_t is different from
	 * pmd_t we want to prevent transit from pmd pointing to page table
	 * to pmd pointing to huge page (and back) while interrupts are disabled.
	 * We clear pmd to possibly replace it with page table pointer in
	 * different code paths. So make sure we wait for the parallel
	 * find_current_mm_pte to finish.
	 */
	serialize_against_pte_lookup(mm);

	return old_pmd;
}

int radix__has_transparent_hugepage(void)
{
	/* For radix 2M at PMD level means thp */
	if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT)
		return 1;
	return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
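
/*
 * Set additional access bits (dirty/accessed/write/exec) in a PTE. On
 * POWER9 DD1, or when a nest MMU (coprocessor) is attached to the mm,
 * the PTE is invalidated and the TLB flushed before the new value is
 * written, to avoid NMMU hangs when relaxing access rights.
 */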
void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
				  pte_t entry, unsigned long address, int psize)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
					      _PAGE_RW | _PAGE_EXEC);

	/*
	 * To avoid NMMU hang while relaxing access, we need to mark
	 * the pte invalid in between.
	 */
	if (cpu_has_feature(CPU_FTR_POWER9_DD1) ||
	    atomic_read(&mm->context.copros) > 0) {
		unsigned long old_pte, new_pte;

		old_pte = __radix_pte_update(ptep, ~0, 0);
		/*
		 * new value of pte
		 */
		new_pte = old_pte | set;
		radix__flush_tlb_page_psize(mm, address, psize);
		__radix_pte_update(ptep, 0, new_pte);
	} else {
		__radix_pte_update(ptep, 0, set);

		/*
		 * Book3S does not require a TLB flush when relaxing access
		 * restrictions when the address space is not attached to a
		 * NMMU, because the core MMU will reload the pte after taking
		 * an access fault, which is defined by the architecture.
		 */
	}
	/* See ptesync comment in radix__set_pte_at */
}