hugetlbpage.c

/*
 * PPC Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/mm.h>
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/export.h>
#include <linux/of_fdt.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
#include <linux/moduleparam.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/setup.h>
#include <asm/hugetlb.h>
#include <asm/pte-walk.h>

#ifdef CONFIG_HUGETLB_PAGE

#define PAGE_SHIFT_64K	16
#define PAGE_SHIFT_512K	19
#define PAGE_SHIFT_8M	23
#define PAGE_SHIFT_16M	24
#define PAGE_SHIFT_16G	34

bool hugetlb_disabled = false;

unsigned int HPAGE_SHIFT;
EXPORT_SYMBOL(HPAGE_SHIFT);

#define hugepd_none(hpd)	(hpd_val(hpd) == 0)

pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	/*
	 * Only called for hugetlbfs pages, hence can ignore THP and the
	 * irq disabled walk.
	 */
	return __find_linux_pte(mm->pgd, addr, NULL, NULL);
}
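
/*
 * Install a newly allocated hugepte table behind the hugepd entry (or
 * entries) at hpdp.  When the hugepage shift is larger than the
 * directory shift, several consecutive directory entries point at the
 * same hugepte table, so each of them is filled in (and unwound again
 * if another CPU raced us).
 */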
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
			   unsigned long address, unsigned int pdshift,
			   unsigned int pshift, spinlock_t *ptl)
{
	struct kmem_cache *cachep;
	pte_t *new;
	int i;
	int num_hugepd;

	if (pshift >= pdshift) {
		cachep = hugepte_cache;
		num_hugepd = 1 << (pshift - pdshift);
	} else {
		cachep = PGT_CACHE(pdshift - pshift);
		num_hugepd = 1;
	}

	new = kmem_cache_zalloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));

	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);

	if (!new)
		return -ENOMEM;

	/*
	 * Make sure other cpus find the hugepd set only after a
	 * properly initialized page table is visible to them.
	 * For more details look for comment in __pte_alloc().
	 */
	smp_wmb();

	spin_lock(ptl);
	/*
	 * We have multiple higher-level entries that point to the same
	 * actual pte location.  Fill in each as we go and backtrack on error.
	 * We need all of these so the DTLB pgtable walk code can find the
	 * right higher-level entry without knowing if it's a hugepage or not.
	 */
	for (i = 0; i < num_hugepd; i++, hpdp++) {
		if (unlikely(!hugepd_none(*hpdp)))
			break;
		else {
#ifdef CONFIG_PPC_BOOK3S_64
			*hpdp = __hugepd(__pa(new) |
					 (shift_to_mmu_psize(pshift) << 2));
#elif defined(CONFIG_PPC_8xx)
			*hpdp = __hugepd(__pa(new) | _PMD_USER |
					 (pshift == PAGE_SHIFT_8M ? _PMD_PAGE_8M :
					  _PMD_PAGE_512K) | _PMD_PRESENT);
#else
			/* We use the old format for PPC_FSL_BOOK3E */
			*hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift);
#endif
		}
	}
	/* If we bailed from the for loop early, an error occurred, clean up */
	if (i < num_hugepd) {
		for (i = i - 1; i >= 0; i--, hpdp--)
			*hpdp = __hugepd(0);
		kmem_cache_free(cachep, new);
	}
	spin_unlock(ptl);
	return 0;
}

/*
 * These macros define how to determine which level of the page table holds
 * the hpdp.
 */
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
#define HUGEPD_PGD_SHIFT PGDIR_SHIFT
#define HUGEPD_PUD_SHIFT PUD_SHIFT
#endif

/*
 * At this point we do the placement change only for BOOK3S 64. This would
 * possibly work on other subarchs.
 */
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	hugepd_t *hpdp = NULL;
	unsigned pshift = __ffs(sz);
	unsigned pdshift = PGDIR_SHIFT;
	spinlock_t *ptl;

	addr &= ~(sz-1);
	pg = pgd_offset(mm, addr);

#ifdef CONFIG_PPC_BOOK3S_64
	if (pshift == PGDIR_SHIFT)
		/* 16GB huge page */
		return (pte_t *) pg;
	else if (pshift > PUD_SHIFT) {
		/*
		 * We need to use hugepd table
		 */
		ptl = &mm->page_table_lock;
		hpdp = (hugepd_t *)pg;
	} else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (pshift == PUD_SHIFT)
			return (pte_t *)pu;
		else if (pshift > PMD_SHIFT) {
			ptl = pud_lockptr(mm, pu);
			hpdp = (hugepd_t *)pu;
		} else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			if (pshift == PMD_SHIFT)
				/* 16MB hugepage */
				return (pte_t *)pm;
			else {
				ptl = pmd_lockptr(mm, pm);
				hpdp = (hugepd_t *)pm;
			}
		}
	}
#else
	if (pshift >= HUGEPD_PGD_SHIFT) {
		ptl = &mm->page_table_lock;
		hpdp = (hugepd_t *)pg;
	} else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (pshift >= HUGEPD_PUD_SHIFT) {
			ptl = pud_lockptr(mm, pu);
			hpdp = (hugepd_t *)pu;
		} else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			ptl = pmd_lockptr(mm, pm);
			hpdp = (hugepd_t *)pm;
		}
	}
#endif
	if (!hpdp)
		return NULL;

	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));

	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr,
						  pdshift, pshift, ptl))
		return NULL;

	return hugepte_offset(*hpdp, addr, pdshift);
}

#ifdef CONFIG_PPC_BOOK3S_64
/*
 * Tracks gpages after the device tree is scanned and before the
 * huge_boot_pages list is ready on pseries.
 */
#define MAX_NUMBER_GPAGES	1024
__initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES];
__initdata static unsigned nr_gpages;

/*
 * Build list of addresses of gigantic pages.  This function is used in early
 * boot before the buddy allocator is setup.
 */
void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
{
	if (!addr)
		return;
	while (number_of_pages > 0) {
		gpage_freearray[nr_gpages] = addr;
		nr_gpages++;
		number_of_pages--;
		addr += page_size;
	}
}

int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
{
	struct huge_bootmem_page *m;
	if (nr_gpages == 0)
		return 0;
	m = phys_to_virt(gpage_freearray[--nr_gpages]);
	gpage_freearray[nr_gpages] = 0;
	list_add(&m->list, &huge_boot_pages);
	m->hstate = hstate;
	return 1;
}
#endif
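
/*
 * Allocate a huge page at boot time: on pseries hash (LPAR, no radix)
 * take one of the gigantic pages reserved via the device tree,
 * otherwise fall back to the generic bootmem allocator.
 */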
int __init alloc_bootmem_huge_page(struct hstate *h)
{
#ifdef CONFIG_PPC_BOOK3S_64
	if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled())
		return pseries_alloc_bootmem_huge_page(h);
#endif
	return __alloc_bootmem_huge_page(h);
}

#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
#define HUGEPD_FREELIST_SIZE \
	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))

struct hugepd_freelist {
	struct rcu_head rcu;
	unsigned int index;
	void *ptes[0];
};

static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);

static void hugepd_free_rcu_callback(struct rcu_head *head)
{
	struct hugepd_freelist *batch =
		container_of(head, struct hugepd_freelist, rcu);
	unsigned int i;

	for (i = 0; i < batch->index; i++)
		kmem_cache_free(hugepte_cache, batch->ptes[i]);

	free_page((unsigned long)batch);
}
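
/*
 * Free a hugepte table.  If other threads of the mm may still be doing
 * lockless walks, batch the table on a per-cpu list and free the whole
 * batch after an RCU-sched grace period (hugepd_free_rcu_callback);
 * otherwise free it immediately.
 */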
static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
{
	struct hugepd_freelist **batchp;

	batchp = &get_cpu_var(hugepd_freelist_cur);

	if (atomic_read(&tlb->mm->mm_users) < 2 ||
	    mm_is_thread_local(tlb->mm)) {
		kmem_cache_free(hugepte_cache, hugepte);
		put_cpu_var(hugepd_freelist_cur);
		return;
	}

	if (*batchp == NULL) {
		*batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
		(*batchp)->index = 0;
	}

	(*batchp)->ptes[(*batchp)->index++] = hugepte;
	if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
		call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
		*batchp = NULL;
	}
	put_cpu_var(hugepd_freelist_cur);
}
#else
static inline void hugepd_free(struct mmu_gather *tlb, void *hugepte) {}
#endif
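
/*
 * Clear the hugepd entry (or entries, on fsl) at hpdp and free the
 * hugepte table it points to, provided the floor/ceiling bounds show
 * that the whole region mapped by the entry can go away.
 */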
static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
			      unsigned long start, unsigned long end,
			      unsigned long floor, unsigned long ceiling)
{
	pte_t *hugepte = hugepd_page(*hpdp);
	int i;

	unsigned long pdmask = ~((1UL << pdshift) - 1);
	unsigned int num_hugepd = 1;
	unsigned int shift = hugepd_shift(*hpdp);

	/* Note: On fsl the hpdp may be the first of several */
	if (shift > pdshift)
		num_hugepd = 1 << (shift - pdshift);

	start &= pdmask;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= pdmask;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	for (i = 0; i < num_hugepd; i++, hpdp++)
		*hpdp = __hugepd(0);

	if (shift >= pdshift)
		hugepd_free(tlb, hugepte);
	else
		pgtable_free_tlb(tlb, hugepte,
				 get_hugepd_cache_index(pdshift - shift));
}
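
/*
 * Walk the pmd entries under one pud for a hugepage range, freeing any
 * hugepd tables found, then free the pmd page itself when the
 * floor/ceiling bounds show nothing else still needs it.
 */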
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		unsigned long more;

		pmd = pmd_offset(pud, addr);
		next = pmd_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
			/*
			 * If it is not a hugepd pointer, we should have
			 * already found it cleared.
			 */
			WARN_ON(!pmd_none_or_clear_bad(pmd));
			continue;
		}
		/*
		 * Increment next by the size of the huge mapping since
		 * there may be more than one entry at this level for a
		 * single hugepage, but all of them point to
		 * the same kmem cache that holds the hugepte.
		 */
		more = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
		if (more > next)
			next = more;

		free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
				  addr, next, floor, ceiling);
	} while (addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}
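
/*
 * Same as above, one level up: walk the pud entries under one pgd,
 * recursing into the pmd level or freeing hugepd tables directly, then
 * free the pud page itself when the floor/ceiling bounds allow it.
 */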
static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	do {
		pud = pud_offset(pgd, addr);
		next = pud_addr_end(addr, end);
		if (!is_hugepd(__hugepd(pud_val(*pud)))) {
			if (pud_none_or_clear_bad(pud))
				continue;
			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
					       ceiling);
		} else {
			unsigned long more;
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at this level for a
			 * single hugepage, but all of them point to
			 * the same kmem cache that holds the hugepte.
			 */
			more = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
			if (more > next)
				next = more;

			free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
	mm_dec_nr_puds(tlb->mm);
}

/*
 * This function frees user-level page tables of a process.
 */
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
			    unsigned long addr, unsigned long end,
			    unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * Because there are a number of different possible pagetable
	 * layouts for hugepage ranges, we limit knowledge of how
	 * things should be laid out to the allocation path
	 * (huge_pte_alloc(), above).  Everything else works out the
	 * structure as it goes from information in the hugepd
	 * pointers.  That means that we can't here use the
	 * optimization used in the normal page free_pgd_range(), of
	 * checking whether we're actually covering a large enough
	 * range to have to do anything at the top level of the walk
	 * instead of at the bottom.
	 *
	 * To make sense of this, you should probably go read the big
	 * block comment at the top of the normal free_pgd_range(),
	 * too.
	 */

	do {
		next = pgd_addr_end(addr, end);
		pgd = pgd_offset(tlb->mm, addr);
		if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
			if (pgd_none_or_clear_bad(pgd))
				continue;
			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
		} else {
			unsigned long more;
			/*
			 * Increment next by the size of the huge mapping since
			 * there may be more than one entry at the pgd level
			 * for a single hugepage, but all of them point to the
			 * same kmem cache that holds the hugepte.
			 */
			more = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
			if (more > next)
				next = more;

			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (addr = next, addr != end);
}
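
/*
 * Look up the page backing 'address' behind a hugepd entry, taking a
 * reference when FOLL_GET is set and waiting for migration entries to
 * be resolved before retrying.
 */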
struct page *follow_huge_pd(struct vm_area_struct *vma,
			    unsigned long address, hugepd_t hpd,
			    int flags, int pdshift)
{
	pte_t *ptep;
	spinlock_t *ptl;
	struct page *page = NULL;
	unsigned long mask;
	int shift = hugepd_shift(hpd);
	struct mm_struct *mm = vma->vm_mm;

retry:
	/*
	 * hugepage directory entries are protected by mm->page_table_lock
	 * Use this instead of huge_pte_lockptr
	 */
	ptl = &mm->page_table_lock;
	spin_lock(ptl);

	ptep = hugepte_offset(hpd, address, pdshift);
	if (pte_present(*ptep)) {
		mask = (1UL << shift) - 1;
		page = pte_page(*ptep);
		page += ((address & mask) >> PAGE_SHIFT);
		if (flags & FOLL_GET)
			get_page(page);
	} else {
		if (is_hugetlb_entry_migration(*ptep)) {
			spin_unlock(ptl);
			__migration_entry_wait(mm, ptep, ptl);
			goto retry;
		}
	}
	spin_unlock(ptl);
	return page;
}
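
/* Return the lesser of the next sz-aligned boundary after addr and end. */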
static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
				      unsigned long sz)
{
	unsigned long __boundary = (addr + sz) & ~(sz-1);
	return (__boundary - 1 < end - 1) ? __boundary : end;
}
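
/*
 * Fast GUP walk over all hugeptes behind one hugepd entry for the range
 * [addr, end).  Returns 0 if any huge page in the range could not be
 * pinned, 1 on success.
 */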
int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned pdshift,
		unsigned long end, int write, struct page **pages, int *nr)
{
	pte_t *ptep;
	unsigned long sz = 1UL << hugepd_shift(hugepd);
	unsigned long next;

	ptep = hugepte_offset(hugepd, addr, pdshift);
	do {
		next = hugepte_addr_end(addr, end, sz);
		if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
			return 0;
	} while (ptep++, addr = next, addr != end);

	return 1;
}

#ifdef CONFIG_PPC_MM_SLICES
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	struct hstate *hstate = hstate_file(file);
	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));

#ifdef CONFIG_PPC_RADIX_MMU
	if (radix_enabled())
		return radix__hugetlb_get_unmapped_area(file, addr, len,
							pgoff, flags);
#endif
	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1);
}
#endif

unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
#ifdef CONFIG_PPC_MM_SLICES
	/* With radix we don't use slice, so derive it from vma */
	if (!radix_enabled()) {
		unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
		return 1UL << mmu_psize_to_shift(psize);
	}
#endif
	return vma_kernel_pagesize(vma);
}

static inline bool is_power_of_4(unsigned long x)
{
	if (is_power_of_2(x))
		return (__ilog2(x) % 2) ? false : true;
	return false;
}

static int __init add_huge_page_size(unsigned long long size)
{
	int shift = __ffs(size);
	int mmu_psize;

	/* Check that it is a page size supported by the hardware and
	 * that it fits within pagetable and slice limits. */
	if (size <= PAGE_SIZE)
		return -EINVAL;
#if defined(CONFIG_PPC_FSL_BOOK3E)
	if (!is_power_of_4(size))
		return -EINVAL;
#elif !defined(CONFIG_PPC_8xx)
	if (!is_power_of_2(size) || (shift > SLICE_HIGH_SHIFT))
		return -EINVAL;
#endif

	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
		return -EINVAL;

#ifdef CONFIG_PPC_BOOK3S_64
	/*
	 * We need to make sure that for different page sizes reported by
	 * firmware we only add hugetlb support for page sizes that can be
	 * supported by linux page table layout.
	 * For now we have
	 * Radix: 2M and 1G
	 * Hash: 16M and 16G
	 */
	if (radix_enabled()) {
		if (mmu_psize != MMU_PAGE_2M && mmu_psize != MMU_PAGE_1G)
			return -EINVAL;
	} else {
		if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G)
			return -EINVAL;
	}
#endif

	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);

	/* Return if huge page size has already been setup */
	if (size_to_hstate(size))
		return 0;

	hugetlb_add_hstate(shift - PAGE_SHIFT);

	return 0;
}

static int __init hugepage_setup_sz(char *str)
{
	unsigned long long size;

	size = memparse(str, &str);

	if (add_huge_page_size(size) != 0) {
		hugetlb_bad_size();
		pr_err("Invalid huge page size specified(%llu)\n", size);
	}

	return 1;
}
__setup("hugepagesz=", hugepage_setup_sz);
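
/* kmem cache for hugepte tables; only created on FSL Book3E and 8xx */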
struct kmem_cache *hugepte_cache;

static int __init hugetlbpage_init(void)
{
	int psize;

	if (hugetlb_disabled) {
		pr_info("HugeTLB support is disabled!\n");
		return 0;
	}

#if !defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_PPC_8xx)
	if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE))
		return -ENODEV;
#endif
	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
		unsigned shift;
		unsigned pdshift;

		if (!mmu_psize_defs[psize].shift)
			continue;

		shift = mmu_psize_to_shift(psize);

#ifdef CONFIG_PPC_BOOK3S_64
		if (shift > PGDIR_SHIFT)
			continue;
		else if (shift > PUD_SHIFT)
			pdshift = PGDIR_SHIFT;
		else if (shift > PMD_SHIFT)
			pdshift = PUD_SHIFT;
		else
			pdshift = PMD_SHIFT;
#else
		if (shift < HUGEPD_PUD_SHIFT)
			pdshift = PMD_SHIFT;
		else if (shift < HUGEPD_PGD_SHIFT)
			pdshift = PUD_SHIFT;
		else
			pdshift = PGDIR_SHIFT;
#endif

		if (add_huge_page_size(1ULL << shift) < 0)
			continue;
		/*
		 * If pdshift and shift are the same, we don't use the pgt
		 * cache for hugepd.
		 */
		if (pdshift > shift)
			pgtable_cache_add(pdshift - shift, NULL);
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
		else if (!hugepte_cache) {
			/*
			 * Create a kmem cache for hugeptes.  The bottom bits in
			 * the pte have size information encoded in them, so
			 * align them to allow this
			 */
			hugepte_cache = kmem_cache_create("hugepte-cache",
							  sizeof(pte_t),
							  HUGEPD_SHIFT_MASK + 1,
							  0, NULL);
			if (hugepte_cache == NULL)
				panic("%s: Unable to create kmem cache "
				      "for hugeptes\n", __func__);
		}
#endif
	}

#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
	/* Default hpage size = 4M on FSL_BOOK3E and 512k on 8xx */
	if (mmu_psize_defs[MMU_PAGE_4M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
	else if (mmu_psize_defs[MMU_PAGE_512K].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_512K].shift;
#else
	/* Set default large page size. Currently, we pick 16M or 1M
	 * depending on what is available
	 */
	if (mmu_psize_defs[MMU_PAGE_16M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
	else if (mmu_psize_defs[MMU_PAGE_2M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift;
#endif
	return 0;
}

arch_initcall(hugetlbpage_init);
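
/*
 * Flush the data and instruction caches for every subpage of a compound
 * huge page, kmapping highmem subpages as needed.
 */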
void flush_dcache_icache_hugepage(struct page *page)
{
	int i;
	void *start;

	BUG_ON(!PageCompound(page));

	for (i = 0; i < (1UL << compound_order(page)); i++) {
		if (!PageHighMem(page)) {
			__flush_dcache_icache(page_address(page+i));
		} else {
			start = kmap_atomic(page+i);
			__flush_dcache_icache(start);
			kunmap_atomic(start);
		}
	}
}

#endif /* CONFIG_HUGETLB_PAGE */

/*
 * We have 4 cases for pgds and pmds:
 * (1) invalid (all zeroes)
 * (2) pointer to next table, as normal; bottom 6 bits == 0
 * (3) leaf pte for huge page, _PAGE_PTE set
 * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
 *
 * So long as we atomically load page table pointers we are safe against teardown,
 * and we can follow the address down to the page and take a ref on it.
 * This function needs to be called with interrupts disabled.  We use this variant
 * when we have MSR[EE] = 0 but the paca->irq_soft_mask = IRQS_ENABLED.
 */
pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
			bool *is_thp, unsigned *hpage_shift)
{
	pgd_t pgd, *pgdp;
	pud_t pud, *pudp;
	pmd_t pmd, *pmdp;
	pte_t *ret_pte;
	hugepd_t *hpdp = NULL;
	unsigned pdshift = PGDIR_SHIFT;

	if (hpage_shift)
		*hpage_shift = 0;

	if (is_thp)
		*is_thp = false;

	pgdp = pgdir + pgd_index(ea);
	pgd  = READ_ONCE(*pgdp);
	/*
	 * Always operate on the local stack value.  This makes sure the
	 * value doesn't get updated by a parallel THP split/collapse,
	 * page fault or a page unmap.  The returned pte_t * is still not
	 * stable, so it should be checked there for the above conditions.
	 */
	if (pgd_none(pgd))
		return NULL;
	else if (pgd_huge(pgd)) {
		ret_pte = (pte_t *) pgdp;
		goto out;
	} else if (is_hugepd(__hugepd(pgd_val(pgd))))
		hpdp = (hugepd_t *)&pgd;
	else {
		/*
		 * Even if we end up with an unmap, the pgtable will not
		 * be freed, because we do an rcu free and here we are
		 * irq disabled.
		 */
		pdshift = PUD_SHIFT;
		pudp = pud_offset(&pgd, ea);
		pud  = READ_ONCE(*pudp);

		if (pud_none(pud))
			return NULL;
		else if (pud_huge(pud)) {
			ret_pte = (pte_t *) pudp;
			goto out;
		} else if (is_hugepd(__hugepd(pud_val(pud))))
			hpdp = (hugepd_t *)&pud;
		else {
			pdshift = PMD_SHIFT;
			pmdp = pmd_offset(&pud, ea);
			pmd  = READ_ONCE(*pmdp);
			/*
			 * A hugepage collapse is captured by pmd_none, because
			 * it marks the pmd none and does a hpte invalidate.
			 */
			if (pmd_none(pmd))
				return NULL;

			if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
				if (is_thp)
					*is_thp = true;
				ret_pte = (pte_t *) pmdp;
				goto out;
			}

			if (pmd_huge(pmd)) {
				ret_pte = (pte_t *) pmdp;
				goto out;
			} else if (is_hugepd(__hugepd(pmd_val(pmd))))
				hpdp = (hugepd_t *)&pmd;
			else
				return pte_offset_kernel(&pmd, ea);
		}
	}
	if (!hpdp)
		return NULL;

	ret_pte = hugepte_offset(*hpdp, ea, pdshift);
	pdshift = hugepd_shift(*hpdp);
out:
	if (hpage_shift)
		*hpage_shift = pdshift;
	return ret_pte;
}
EXPORT_SYMBOL_GPL(__find_linux_pte);
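
/*
 * Fast GUP for one hugepte: pin every subpage of the huge page covering
 * [addr, end), then recheck the pte to back out if it changed under us.
 */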
int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
		unsigned long end, int write, struct page **pages, int *nr)
{
	unsigned long pte_end;
	struct page *head, *page;
	pte_t pte;
	int refs;

	pte_end = (addr + sz) & ~(sz-1);
	if (pte_end < end)
		end = pte_end;

	pte = READ_ONCE(*ptep);

	if (!pte_access_permitted(pte, write))
		return 0;

	/* hugepages are never "special" */
	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

	refs = 0;
	head = pte_page(pte);

	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
	do {
		VM_BUG_ON(compound_head(page) != head);
		pages[*nr] = page;
		(*nr)++;
		page++;
		refs++;
	} while (addr += PAGE_SIZE, addr != end);

	if (!page_cache_add_speculative(head, refs)) {
		*nr -= refs;
		return 0;
	}

	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
		/* Could be optimized better */
		*nr -= refs;
		while (refs--)
			put_page(head);
		return 0;
	}

	return 1;
}