hugetlbpage.c

/*
 * PPC Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/mm.h>
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/export.h>
#include <linux/of_fdt.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
#include <linux/moduleparam.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/setup.h>
#include <asm/hugetlb.h>
#include <asm/pte-walk.h>

#ifdef CONFIG_HUGETLB_PAGE

#define PAGE_SHIFT_64K	16
#define PAGE_SHIFT_512K	19
#define PAGE_SHIFT_8M	23
#define PAGE_SHIFT_16M	24
#define PAGE_SHIFT_16G	34

bool hugetlb_disabled = false;

unsigned int HPAGE_SHIFT;
EXPORT_SYMBOL(HPAGE_SHIFT);

#define hugepd_none(hpd)	(hpd_val(hpd) == 0)

pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	/*
	 * Only called for hugetlbfs pages, hence can ignore THP and the
	 * irq disabled walk.
	 */
	return __find_linux_pte(mm->pgd, addr, NULL, NULL);
}
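
/*
 * Allocate a hugepte table from the appropriate kmem cache and point the
 * hugepd entry (or entries, when one hugepage needs several identical
 * entries) covering @address at it.  @ptl is the lock protecting the
 * hugepd level.  Returns 0 on success, or if another thread populated
 * the entry first.
 */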
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
			   unsigned long address, unsigned int pdshift,
			   unsigned int pshift, spinlock_t *ptl)
{
	struct kmem_cache *cachep;
	pte_t *new;
	int i;
	int num_hugepd;

	if (pshift >= pdshift) {
		cachep = hugepte_cache;
		num_hugepd = 1 << (pshift - pdshift);
	} else {
		cachep = PGT_CACHE(pdshift - pshift);
		num_hugepd = 1;
	}

	new = kmem_cache_zalloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));

	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);

	if (!new)
		return -ENOMEM;

	/*
	 * Make sure other cpus find the hugepd set only after a
	 * properly initialized page table is visible to them.
	 * For more details look for comment in __pte_alloc().
	 */
	smp_wmb();

	spin_lock(ptl);
	/*
	 * We have multiple higher-level entries that point to the same
	 * actual pte location.  Fill in each as we go and backtrack on error.
	 * We need all of these so the DTLB pgtable walk code can find the
	 * right higher-level entry without knowing if it's a hugepage or not.
	 */
	for (i = 0; i < num_hugepd; i++, hpdp++) {
		if (unlikely(!hugepd_none(*hpdp)))
			break;
		else {
#ifdef CONFIG_PPC_BOOK3S_64
			*hpdp = __hugepd(__pa(new) |
					 (shift_to_mmu_psize(pshift) << 2));
#elif defined(CONFIG_PPC_8xx)
			*hpdp = __hugepd(__pa(new) | _PMD_USER |
					 (pshift == PAGE_SHIFT_8M ? _PMD_PAGE_8M :
					  _PMD_PAGE_512K) | _PMD_PRESENT);
#else
			/* We use the old format for PPC_FSL_BOOK3E */
			*hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift);
#endif
		}
	}
	/* If we bailed from the for loop early, an error occurred, clean up */
	if (i < num_hugepd) {
		for (i = i - 1; i >= 0; i--, hpdp--)
			*hpdp = __hugepd(0);
		kmem_cache_free(cachep, new);
	}
	spin_unlock(ptl);
	return 0;
}

/*
 * These macros define how to determine which level of the page table holds
 * the hpdp.
 */
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
#define HUGEPD_PGD_SHIFT PGDIR_SHIFT
#define HUGEPD_PUD_SHIFT PUD_SHIFT
#endif

/*
 * At this point we do the placement change only for BOOK3S 64. This would
 * possibly work on other subarchs.
 */
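
/*
 * Walk (allocating as needed) down to the page table level that holds the
 * hugepd for a mapping of size @sz, install a hugepte table there if none
 * is present, and return a pointer to the pte slot for @addr.
 */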
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	hugepd_t *hpdp = NULL;
	unsigned pshift = __ffs(sz);
	unsigned pdshift = PGDIR_SHIFT;
	spinlock_t *ptl;

	addr &= ~(sz-1);
	pg = pgd_offset(mm, addr);

#ifdef CONFIG_PPC_BOOK3S_64
	if (pshift == PGDIR_SHIFT)
		/* 16GB huge page */
		return (pte_t *) pg;
	else if (pshift > PUD_SHIFT) {
		/*
		 * We need to use hugepd table
		 */
		ptl = &mm->page_table_lock;
		hpdp = (hugepd_t *)pg;
	} else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (pshift == PUD_SHIFT)
			return (pte_t *)pu;
		else if (pshift > PMD_SHIFT) {
			ptl = pud_lockptr(mm, pu);
			hpdp = (hugepd_t *)pu;
		} else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			if (pshift == PMD_SHIFT)
				/* 16MB hugepage */
				return (pte_t *)pm;
			else {
				ptl = pmd_lockptr(mm, pm);
				hpdp = (hugepd_t *)pm;
			}
		}
	}
#else
	if (pshift >= HUGEPD_PGD_SHIFT) {
		ptl = &mm->page_table_lock;
		hpdp = (hugepd_t *)pg;
	} else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (pshift >= HUGEPD_PUD_SHIFT) {
			ptl = pud_lockptr(mm, pu);
			hpdp = (hugepd_t *)pu;
		} else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			ptl = pmd_lockptr(mm, pm);
			hpdp = (hugepd_t *)pm;
		}
	}
#endif
	if (!hpdp)
		return NULL;

	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));

	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr,
						  pdshift, pshift, ptl))
		return NULL;

	return hugepte_offset(*hpdp, addr, pdshift);
}

#ifdef CONFIG_PPC_BOOK3S_64
/*
 * Tracks gpages after the device tree is scanned and before the
 * huge_boot_pages list is ready on pseries.
 */
#define MAX_NUMBER_GPAGES	1024
__initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES];
__initdata static unsigned nr_gpages;

/*
 * Build list of addresses of gigantic pages.  This function is used in early
 * boot before the buddy allocator is set up.
 */
void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
{
	if (!addr)
		return;

	while (number_of_pages > 0) {
		gpage_freearray[nr_gpages] = addr;
		nr_gpages++;
		number_of_pages--;
		addr += page_size;
	}
}
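
/*
 * Moves the gigantic page addresses from the temporary list to the
 * huge_boot_pages list.
 */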
int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
{
	struct huge_bootmem_page *m;

	if (nr_gpages == 0)
		return 0;
	m = phys_to_virt(gpage_freearray[--nr_gpages]);
	gpage_freearray[nr_gpages] = 0;
	list_add(&m->list, &huge_boot_pages);
	m->hstate = hstate;
	return 1;
}
#endif
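
/*
 * Reserve a huge page at boot.  Hash LPAR (pseries) guests pull one from
 * the firmware-provided gpage list populated above; everything else falls
 * back to the generic __alloc_bootmem_huge_page().
 */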
int __init alloc_bootmem_huge_page(struct hstate *h)
{
#ifdef CONFIG_PPC_BOOK3S_64
	if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled())
		return pseries_alloc_bootmem_huge_page(h);
#endif
	return __alloc_bootmem_huge_page(h);
}
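
/*
 * On FSL Book3E and 8xx several hugepd entries can reference the same
 * hugepte table, and lockless walkers may still be traversing it when it
 * is torn down.  When the mm may be in use on other cpus, freed tables are
 * therefore batched per cpu and released after an RCU grace period;
 * single-user or thread-local mms free them directly.
 */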
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
#define HUGEPD_FREELIST_SIZE \
	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))

struct hugepd_freelist {
	struct rcu_head rcu;
	unsigned int index;
	void *ptes[0];
};

static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);

static void hugepd_free_rcu_callback(struct rcu_head *head)
{
	struct hugepd_freelist *batch =
		container_of(head, struct hugepd_freelist, rcu);
	unsigned int i;

	for (i = 0; i < batch->index; i++)
		kmem_cache_free(hugepte_cache, batch->ptes[i]);

	free_page((unsigned long)batch);
}

static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
{
	struct hugepd_freelist **batchp;

	batchp = &get_cpu_var(hugepd_freelist_cur);

	if (atomic_read(&tlb->mm->mm_users) < 2 ||
	    mm_is_thread_local(tlb->mm)) {
		kmem_cache_free(hugepte_cache, hugepte);
		put_cpu_var(hugepd_freelist_cur);
		return;
	}

	if (*batchp == NULL) {
		*batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
		(*batchp)->index = 0;
	}

	(*batchp)->ptes[(*batchp)->index++] = hugepte;
	if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
		call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
		*batchp = NULL;
	}
	put_cpu_var(hugepd_freelist_cur);
}
#else
static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte) {}
#endif
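
/*
 * Clear the hugepd entry (or entries, on fsl) covering [start, end) and
 * free the underlying hugepte table, unless the table is shared with
 * mappings outside the [floor, ceiling) range being torn down.
 */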
static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
			      unsigned long start, unsigned long end,
			      unsigned long floor, unsigned long ceiling)
{
	pte_t *hugepte = hugepd_page(*hpdp);
	int i;

	unsigned long pdmask = ~((1UL << pdshift) - 1);
	unsigned int num_hugepd = 1;
	unsigned int shift = hugepd_shift(*hpdp);

	/* Note: On fsl the hpdp may be the first of several */
	if (shift > pdshift)
		num_hugepd = 1 << (shift - pdshift);

	start &= pdmask;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= pdmask;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	for (i = 0; i < num_hugepd; i++, hpdp++)
		*hpdp = __hugepd(0);

	if (shift >= pdshift)
		hugepd_free(tlb, hugepte);
	else
		pgtable_free_tlb(tlb, hugepte, pdshift - shift);
}

static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		unsigned long more;

		pmd = pmd_offset(pud, addr);
		next = pmd_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
			/*
			 * If it is not a hugepd pointer, we should already
			 * have found it cleared.
			 */
			WARN_ON(!pmd_none_or_clear_bad(pmd));
			continue;
		}
		/*
		 * Increment next by the size of the huge mapping since
		 * there may be more than one entry at this level for a
		 * single hugepage, but all of them point to
		 * the same kmem cache that holds the hugepte.
		 */
		more = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
		if (more > next)
			next = more;

		free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
				  addr, next, floor, ceiling);
	} while (addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}

static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		pud = pud_offset(pgd, addr);
		next = pud_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pud_val(*pud)))) {
			if (pud_none_or_clear_bad(pud))
				continue;
			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
					       ceiling);
		} else {
			unsigned long more;
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at this level for a
			 * single hugepage, but all of them point to
			 * the same kmem cache that holds the hugepte.
			 */
			more = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
			if (more > next)
				next = more;

			free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
	mm_dec_nr_puds(tlb->mm);
}

/*
 * This function frees user-level page tables of a process.
 */
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
			    unsigned long addr, unsigned long end,
			    unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * Because there are a number of different possible pagetable
	 * layouts for hugepage ranges, we limit knowledge of how
	 * things should be laid out to the allocation path
	 * (huge_pte_alloc(), above).  Everything else works out the
	 * structure as it goes from information in the hugepd
	 * pointers.  That means that here we can't use the
	 * optimization used in the normal page free_pgd_range(), of
	 * checking whether we're actually covering a large enough
	 * range to have to do anything at the top level of the walk
	 * instead of at the bottom.
	 *
	 * To make sense of this, you should probably go read the big
	 * block comment at the top of the normal free_pgd_range(),
	 * too.
	 */
	do {
		next = pgd_addr_end(addr, end);
		pgd = pgd_offset(tlb->mm, addr);
		if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
			if (pgd_none_or_clear_bad(pgd))
				continue;
			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
		} else {
			unsigned long more;
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at the pgd level
			 * for a single hugepage, but all of them point to the
			 * same kmem cache that holds the hugepte.
			 */
			more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
			if (more > next)
				next = more;

			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);
}
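
/*
 * Look up the page backing @address in a hugepd-mapped region, taking a
 * reference if FOLL_GET is set.  Runs under mm->page_table_lock and waits
 * for a migration entry to be resolved before retrying.
 */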
struct page *follow_huge_pd(struct vm_area_struct *vma,
			    unsigned long address, hugepd_t hpd,
			    int flags, int pdshift)
{
	pte_t *ptep;
	spinlock_t *ptl;
	struct page *page = NULL;
	unsigned long mask;
	int shift = hugepd_shift(hpd);
	struct mm_struct *mm = vma->vm_mm;

retry:
	/*
	 * hugepage directory entries are protected by mm->page_table_lock
	 * Use this instead of huge_pte_lockptr
	 */
	ptl = &mm->page_table_lock;
	spin_lock(ptl);

	ptep = hugepte_offset(hpd, address, pdshift);
	if (pte_present(*ptep)) {
		mask = (1UL << shift) - 1;
		page = pte_page(*ptep);
		page += ((address & mask) >> PAGE_SHIFT);
		if (flags & FOLL_GET)
			get_page(page);
	} else {
		if (is_hugetlb_entry_migration(*ptep)) {
			spin_unlock(ptl);
			__migration_entry_wait(mm, ptep, ptl);
			goto retry;
		}
	}
	spin_unlock(ptl);
	return page;
}

static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
				      unsigned long sz)
{
	unsigned long __boundary = (addr + sz) & ~(sz-1);
	return (__boundary - 1 < end - 1) ? __boundary : end;
}
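
/*
 * Lockless get_user_pages path for a hugepd: walk every hugepte covering
 * [addr, end) and hand each one to gup_hugepte().  Returns 0 as soon as
 * any entry cannot be grabbed, 1 on success.
 */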
int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned pdshift,
		unsigned long end, int write, struct page **pages, int *nr)
{
	pte_t *ptep;
	unsigned long sz = 1UL << hugepd_shift(hugepd);
	unsigned long next;

	ptep = hugepte_offset(hugepd, addr, pdshift);
	do {
		next = hugepte_addr_end(addr, end, sz);
		if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
			return 0;
	} while (ptep++, addr = next, addr != end);

	return 1;
}

#ifdef CONFIG_PPC_MM_SLICES
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	struct hstate *hstate = hstate_file(file);
	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));

#ifdef CONFIG_PPC_RADIX_MMU
	if (radix_enabled())
		return radix__hugetlb_get_unmapped_area(file, addr, len,
							pgoff, flags);
#endif
	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
}
#endif

unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
#ifdef CONFIG_PPC_MM_SLICES
	/* With radix we don't use slice, so derive it from vma */
	if (!radix_enabled()) {
		unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
		return 1UL << mmu_psize_to_shift(psize);
	}
#endif
	return vma_kernel_pagesize(vma);
}
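
/*
 * Helper for add_huge_page_size() below, which on FSL Book3E only accepts
 * huge page sizes that are a power of 4.
 */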
static inline bool is_power_of_4(unsigned long x)
{
	if (is_power_of_2(x))
		return (__ilog2(x) % 2) ? false : true;
	return false;
}

static int __init add_huge_page_size(unsigned long long size)
{
	int shift = __ffs(size);
	int mmu_psize;

	/* Check that it is a page size supported by the hardware and
	 * that it fits within pagetable and slice limits. */
	if (size <= PAGE_SIZE)
		return -EINVAL;
#if defined(CONFIG_PPC_FSL_BOOK3E)
	if (!is_power_of_4(size))
		return -EINVAL;
#elif !defined(CONFIG_PPC_8xx)
	if (!is_power_of_2(size) || (shift > SLICE_HIGH_SHIFT))
		return -EINVAL;
#endif

	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
		return -EINVAL;

#ifdef CONFIG_PPC_BOOK3S_64
	/*
	 * We need to make sure that for different page sizes reported by
	 * firmware we only add hugetlb support for page sizes that can be
	 * supported by the Linux page table layout.
	 * For now we have
	 * Radix: 2M and 1G (1G not on POWER9 DD1)
	 * Hash: 16M and 16G
	 */
	if (radix_enabled()) {
		if (mmu_psize != MMU_PAGE_2M) {
			if (cpu_has_feature(CPU_FTR_POWER9_DD1) ||
			    (mmu_psize != MMU_PAGE_1G))
				return -EINVAL;
		}
	} else {
		if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G)
			return -EINVAL;
	}
#endif

	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);

	/* Return if huge page size has already been setup */
	if (size_to_hstate(size))
		return 0;

	hugetlb_add_hstate(shift - PAGE_SHIFT);

	return 0;
}

static int __init hugepage_setup_sz(char *str)
{
	unsigned long long size;

	size = memparse(str, &str);

	if (add_huge_page_size(size) != 0) {
		hugetlb_bad_size();
		pr_err("Invalid huge page size specified(%llu)\n", size);
	}

	return 1;
}
__setup("hugepagesz=", hugepage_setup_sz);

struct kmem_cache *hugepte_cache;
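
/*
 * Register every huge page size the MMU reports, create the matching
 * page table (or hugepte) caches, and pick the default huge page size
 * (HPAGE_SHIFT).
 */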
static int __init hugetlbpage_init(void)
{
	int psize;

	if (hugetlb_disabled) {
		pr_info("HugeTLB support is disabled!\n");
		return 0;
	}

#if !defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_PPC_8xx)
	if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE))
		return -ENODEV;
#endif
	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
		unsigned shift;
		unsigned pdshift;

		if (!mmu_psize_defs[psize].shift)
			continue;

		shift = mmu_psize_to_shift(psize);

#ifdef CONFIG_PPC_BOOK3S_64
		if (shift > PGDIR_SHIFT)
			continue;
		else if (shift > PUD_SHIFT)
			pdshift = PGDIR_SHIFT;
		else if (shift > PMD_SHIFT)
			pdshift = PUD_SHIFT;
		else
			pdshift = PMD_SHIFT;
#else
		if (shift < HUGEPD_PUD_SHIFT)
			pdshift = PMD_SHIFT;
		else if (shift < HUGEPD_PGD_SHIFT)
			pdshift = PUD_SHIFT;
		else
			pdshift = PGDIR_SHIFT;
#endif

		if (add_huge_page_size(1ULL << shift) < 0)
			continue;
		/*
		 * If pdshift and shift are the same, we don't use the
		 * pgtable cache for the hugepd.
		 */
		if (pdshift > shift)
			pgtable_cache_add(pdshift - shift, NULL);
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
		else if (!hugepte_cache) {
			/*
			 * Create a kmem cache for hugeptes.  The bottom bits in
			 * the pte have size information encoded in them, so
			 * align them to allow this
			 */
			hugepte_cache = kmem_cache_create("hugepte-cache",
							  sizeof(pte_t),
							  HUGEPD_SHIFT_MASK + 1,
							  0, NULL);
			if (hugepte_cache == NULL)
				panic("%s: Unable to create kmem cache for hugeptes\n",
				      __func__);
		}
#endif
	}

#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
	/* Default hpage size = 4M on FSL_BOOK3E and 512k on 8xx */
	if (mmu_psize_defs[MMU_PAGE_4M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
	else if (mmu_psize_defs[MMU_PAGE_512K].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_512K].shift;
#else
	/* Set default large page size. Currently, we pick 16M or 1M
	 * depending on what is available
	 */
	if (mmu_psize_defs[MMU_PAGE_16M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
	else if (mmu_psize_defs[MMU_PAGE_2M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift;
#endif
	return 0;
}

arch_initcall(hugetlbpage_init);
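
/*
 * Flush the data and instruction caches for every subpage of a compound
 * huge page, mapping highmem subpages with kmap_atomic() as needed.
 */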
void flush_dcache_icache_hugepage(struct page *page)
{
	int i;
	void *start;

	BUG_ON(!PageCompound(page));

	for (i = 0; i < (1UL << compound_order(page)); i++) {
		if (!PageHighMem(page)) {
			__flush_dcache_icache(page_address(page+i));
		} else {
			start = kmap_atomic(page+i);
			__flush_dcache_icache(start);
			kunmap_atomic(start);
		}
	}
}

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * We have 4 cases for pgds and pmds:
 * (1) invalid (all zeroes)
 * (2) pointer to next table, as normal; bottom 6 bits == 0
 * (3) leaf pte for huge page, _PAGE_PTE set
 * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
 *
 * So long as we atomically load page table pointers we are safe against
 * teardown, and we can follow the address down to the page and take a ref
 * on it.  This function needs to be called with interrupts disabled.  We
 * use this variant when we have MSR[EE] = 0 but the
 * paca->irq_soft_mask = IRQS_ENABLED
 */
pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
			bool *is_thp, unsigned *hpage_shift)
{
	pgd_t pgd, *pgdp;
	pud_t pud, *pudp;
	pmd_t pmd, *pmdp;
	pte_t *ret_pte;
	hugepd_t *hpdp = NULL;
	unsigned pdshift = PGDIR_SHIFT;

	if (hpage_shift)
		*hpage_shift = 0;

	if (is_thp)
		*is_thp = false;

	pgdp = pgdir + pgd_index(ea);
	pgd = READ_ONCE(*pgdp);
	/*
	 * Always operate on the local stack value.  This makes sure the
	 * value doesn't get updated by a parallel THP split/collapse,
	 * page fault or a page unmap.  The returned pte_t * is still not
	 * stable, so the above conditions must be rechecked there.
	 */
	if (pgd_none(pgd))
		return NULL;
	else if (pgd_huge(pgd)) {
		ret_pte = (pte_t *) pgdp;
		goto out;
	} else if (is_hugepd(__hugepd(pgd_val(pgd))))
		hpdp = (hugepd_t *)&pgd;
	else {
		/*
		 * Even if we end up with an unmap, the pgtable will not
		 * be freed, because we do an rcu free and here we are
		 * irq disabled
		 */
		pdshift = PUD_SHIFT;
		pudp = pud_offset(&pgd, ea);
		pud  = READ_ONCE(*pudp);

		if (pud_none(pud))
			return NULL;
		else if (pud_huge(pud)) {
			ret_pte = (pte_t *) pudp;
			goto out;
		} else if (is_hugepd(__hugepd(pud_val(pud))))
			hpdp = (hugepd_t *)&pud;
		else {
			pdshift = PMD_SHIFT;
			pmdp = pmd_offset(&pud, ea);
			pmd  = READ_ONCE(*pmdp);
			/*
			 * A hugepage collapse is captured by pmd_none, because
			 * it marks the pmd none and does a hpte invalidate.
			 */
			if (pmd_none(pmd))
				return NULL;

			if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
				if (is_thp)
					*is_thp = true;
				ret_pte = (pte_t *) pmdp;
				goto out;
			}

			if (pmd_huge(pmd)) {
				ret_pte = (pte_t *) pmdp;
				goto out;
			} else if (is_hugepd(__hugepd(pmd_val(pmd))))
				hpdp = (hugepd_t *)&pmd;
			else
				return pte_offset_kernel(&pmd, ea);
		}
	}
	if (!hpdp)
		return NULL;

	ret_pte = hugepte_offset(*hpdp, ea, pdshift);
	pdshift = hugepd_shift(*hpdp);
out:
	if (hpage_shift)
		*hpage_shift = pdshift;
	return ret_pte;
}
EXPORT_SYMBOL_GPL(__find_linux_pte);
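
/*
 * Lockless GUP for one hugepte: record the subpages covering [addr, end),
 * speculatively take @refs references on the head page, then recheck that
 * the pte did not change underneath us and drop the references if it did.
 */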
int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
		unsigned long end, int write, struct page **pages, int *nr)
{
	unsigned long pte_end;
	struct page *head, *page;
	pte_t pte;
	int refs;

	pte_end = (addr + sz) & ~(sz-1);
	if (pte_end < end)
		end = pte_end;

	pte = READ_ONCE(*ptep);

	if (!pte_access_permitted(pte, write))
		return 0;

	/* hugepages are never "special" */
	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

	refs = 0;
	head = pte_page(pte);

	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
	do {
		VM_BUG_ON(compound_head(page) != head);
		pages[*nr] = page;
		(*nr)++;
		page++;
		refs++;
	} while (addr += PAGE_SIZE, addr != end);

	if (!page_cache_add_speculative(head, refs)) {
		*nr -= refs;
		return 0;
	}

	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
		/* Could be optimized better */
		*nr -= refs;
		while (refs--)
			put_page(head);
		return 0;
	}

	return 1;
}