/*
 *    Copyright IBM Corp. 2007, 2011
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/ksm.h>
#include <linux/mman.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
#define FRAG_MASK	0x0f
#else
#define ALLOC_ORDER	2
#define FRAG_MASK	0x03
#endif

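/*
 * A region/segment (crst) table occupies 1 << ALLOC_ORDER pages: four
 * pages (2048 eight-byte entries) on 64-bit, two pages on 31-bit.
 * FRAG_MASK is the bitmap of page table fragments that fit into one 4K
 * page: two 2K fragments (0x03) on 64-bit, four 1K fragments (0x0f) on
 * 31-bit. The per-fragment allocation state lives in page->_mapcount.
 */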
unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long) table, ALLOC_ORDER);
}

#ifdef CONFIG_64BIT

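/*
 * IPI callback for crst_table_upgrade(): CPUs that are currently running
 * on the upgraded mm reload the user ASCE so that it points to the new
 * top-level table, then flush their local TLB.
 */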
static void __crst_table_upgrade(void *arg)
{
	struct mm_struct *mm = arg;

	if (current->active_mm == mm) {
		clear_user_asce();
		set_user_asce(mm);
	}
	__tlb_flush_local();
}

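/*
 * Grow the address space of @mm to at least @limit by stacking
 * additional region tables on top of the current page table root,
 * one level per iteration (2GB -> 4TB -> 8PB).
 */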
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;
	int flush;

	BUG_ON(limit > (1UL << 53));
	flush = 0;
repeat:
	table = crst_table_alloc(mm);
	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
		flush = 1;
	}
	spin_unlock_bh(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	if (flush)
		on_each_cpu(__crst_table_upgrade, mm, 0);
	return 0;
}

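/*
 * Shrink the address space of @mm back to @limit by removing the topmost
 * region tables again (e.g. when an mm is switched to a 31-bit compat
 * layout).
 */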
void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	if (current->active_mm == mm) {
		clear_user_asce();
		__tlb_flush_mm(mm);
	}
	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
	if (current->active_mm == mm)
		set_user_asce(mm);
}
#endif

#ifdef CONFIG_PGSTE

/**
 * gmap_alloc - allocate a guest address space
 * @mm: pointer to the parent mm_struct
 * @limit: maximum size of the gmap address space
 *
 * Returns a guest address space structure.
 */
struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
{
	struct gmap *gmap;
	struct page *page;
	unsigned long *table;
	unsigned long etype, atype;

	if (limit < (1UL << 31)) {
		limit = (1UL << 31) - 1;
		atype = _ASCE_TYPE_SEGMENT;
		etype = _SEGMENT_ENTRY_EMPTY;
	} else if (limit < (1UL << 42)) {
		limit = (1UL << 42) - 1;
		atype = _ASCE_TYPE_REGION3;
		etype = _REGION3_ENTRY_EMPTY;
	} else if (limit < (1UL << 53)) {
		limit = (1UL << 53) - 1;
		atype = _ASCE_TYPE_REGION2;
		etype = _REGION2_ENTRY_EMPTY;
	} else {
		limit = -1UL;
		atype = _ASCE_TYPE_REGION1;
		etype = _REGION1_ENTRY_EMPTY;
	}
	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
	if (!gmap)
		goto out;
	INIT_LIST_HEAD(&gmap->crst_list);
	INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
	INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
	spin_lock_init(&gmap->guest_table_lock);
	gmap->mm = mm;
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		goto out_free;
	page->index = 0;
	list_add(&page->lru, &gmap->crst_list);
	table = (unsigned long *) page_to_phys(page);
	crst_table_init(table, etype);
	gmap->table = table;
	gmap->asce = atype | _ASCE_TABLE_LENGTH |
		_ASCE_USER_BITS | __pa(table);
	gmap->asce_end = limit;
	down_write(&mm->mmap_sem);
	list_add(&gmap->list, &mm->context.gmap_list);
	up_write(&mm->mmap_sem);
	return gmap;

out_free:
	kfree(gmap);
out:
	return NULL;
}
EXPORT_SYMBOL_GPL(gmap_alloc);

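/*
 * Flush all TLB entries created through the gmap ASCE; fall back to a
 * global flush if the machine has no IDTE facility.
 */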
static void gmap_flush_tlb(struct gmap *gmap)
{
	if (MACHINE_HAS_IDTE)
		__tlb_flush_asce(gmap->mm, gmap->asce);
	else
		__tlb_flush_global();
}

static void gmap_radix_tree_free(struct radix_tree_root *root)
{
	struct radix_tree_iter iter;
	unsigned long indices[16];
	unsigned long index;
	void **slot;
	int i, nr;

	/* A radix tree is freed by deleting all of its entries */
	index = 0;
	do {
		nr = 0;
		radix_tree_for_each_slot(slot, root, &iter, index) {
			indices[nr] = iter.index;
			if (++nr == 16)
				break;
		}
		for (i = 0; i < nr; i++) {
			index = indices[i];
			radix_tree_delete(root, index);
		}
	} while (nr > 0);
}

/**
 * gmap_free - free a guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_free(struct gmap *gmap)
{
	struct page *page, *next;

	/* Flush tlb. */
	if (MACHINE_HAS_IDTE)
		__tlb_flush_asce(gmap->mm, gmap->asce);
	else
		__tlb_flush_global();

	/* Free all segment & region tables. */
	list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
		__free_pages(page, ALLOC_ORDER);
	gmap_radix_tree_free(&gmap->guest_to_host);
	gmap_radix_tree_free(&gmap->host_to_guest);
	down_write(&gmap->mm->mmap_sem);
	list_del(&gmap->list);
	up_write(&gmap->mm->mmap_sem);
	kfree(gmap);
}
EXPORT_SYMBOL_GPL(gmap_free);

/**
 * gmap_enable - switch primary space to the guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_enable(struct gmap *gmap)
{
	S390_lowcore.gmap = (unsigned long) gmap;
}
EXPORT_SYMBOL_GPL(gmap_enable);

/**
 * gmap_disable - switch back to the standard primary address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_disable(struct gmap *gmap)
{
	S390_lowcore.gmap = 0UL;
}
EXPORT_SYMBOL_GPL(gmap_disable);

/*
 * gmap_alloc_table is assumed to be called with mmap_sem held
 */
static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
			    unsigned long init, unsigned long gaddr)
{
	struct page *page;
	unsigned long *new;

	/* since we don't free the gmap table until gmap_free we can unlock */
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		return -ENOMEM;
	new = (unsigned long *) page_to_phys(page);
	crst_table_init(new, init);
	spin_lock(&gmap->mm->page_table_lock);
	if (*table & _REGION_ENTRY_INVALID) {
		list_add(&page->lru, &gmap->crst_list);
		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
			(*table & _REGION_ENTRY_TYPE_MASK);
		page->index = gaddr;
		page = NULL;
	}
	spin_unlock(&gmap->mm->page_table_lock);
	if (page)
		__free_pages(page, ALLOC_ORDER);
	return 0;
}

/**
 * __gmap_segment_gaddr - find virtual address from segment pointer
 * @entry: pointer to a segment table entry in the guest address space
 *
 * Returns the virtual address in the guest address space for the segment
 */
static unsigned long __gmap_segment_gaddr(unsigned long *entry)
{
	struct page *page;
	unsigned long offset, mask;

	offset = (unsigned long) entry / sizeof(unsigned long);
	offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE;
	mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
	page = virt_to_page((void *)((unsigned long) entry & mask));
	return page->index + offset;
}

/**
 * __gmap_unlink_by_vmaddr - unlink a single segment via a host address
 * @gmap: pointer to the guest address space structure
 * @vmaddr: address in the host process address space
 *
 * Returns 1 if a TLB flush is required
 */
static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
{
	unsigned long *entry;
	int flush = 0;

	spin_lock(&gmap->guest_table_lock);
	entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
	if (entry) {
		flush = (*entry != _SEGMENT_ENTRY_INVALID);
		*entry = _SEGMENT_ENTRY_INVALID;
	}
	spin_unlock(&gmap->guest_table_lock);
	return flush;
}

/**
 * __gmap_unmap_by_gaddr - unmap a single segment via a guest address
 * @gmap: pointer to the guest address space structure
 * @gaddr: address in the guest address space
 *
 * Returns 1 if a TLB flush is required
 */
static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr;

	vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host,
						   gaddr >> PMD_SHIFT);
	return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0;
}

/**
 * gmap_unmap_segment - unmap segment from the guest address space
 * @gmap: pointer to the guest address space structure
 * @to: address in the guest address space
 * @len: length of the memory area to unmap
 *
 * Returns 0 if the unmap succeeded, -EINVAL if not.
 */
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
{
	unsigned long off;
	int flush;

	if ((to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || to + len < to)
		return -EINVAL;

	flush = 0;
	down_write(&gmap->mm->mmap_sem);
	for (off = 0; off < len; off += PMD_SIZE)
		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
	up_write(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;
}
EXPORT_SYMBOL_GPL(gmap_unmap_segment);

/**
 * gmap_map_segment - map a segment to the guest address space
 * @gmap: pointer to the guest address space structure
 * @from: source address in the parent address space
 * @to: target address in the guest address space
 * @len: length of the memory area to map
 *
 * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
 */
int gmap_map_segment(struct gmap *gmap, unsigned long from,
		     unsigned long to, unsigned long len)
{
	unsigned long off;
	int flush;

	if ((from | to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || from + len < from || to + len < to ||
	    from + len > TASK_MAX_SIZE || to + len > gmap->asce_end)
		return -EINVAL;

	flush = 0;
	down_write(&gmap->mm->mmap_sem);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Remove old translation */
		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
		/* Store new translation */
		if (radix_tree_insert(&gmap->guest_to_host,
				      (to + off) >> PMD_SHIFT,
				      (void *) from + off))
			break;
	}
	up_write(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	if (off >= len)
		return 0;
	gmap_unmap_segment(gmap, to, len);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(gmap_map_segment);

/**
 * __gmap_translate - translate a guest address to a user space address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 * The mmap_sem of the mm that belongs to the address space must be held
 * when this function gets called.
 */
unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr;

	vmaddr = (unsigned long)
		radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
	return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
}
EXPORT_SYMBOL_GPL(__gmap_translate);

/**
 * gmap_translate - translate a guest address to a user space address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 */
unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long rc;

	down_read(&gmap->mm->mmap_sem);
	rc = __gmap_translate(gmap, gaddr);
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_translate);

/**
 * gmap_unlink - disconnect a page table from the gmap shadow tables
 * @mm: pointer to the parent mm_struct
 * @table: pointer to the host page table
 * @vmaddr: vm address associated with the host page table
 */
static void gmap_unlink(struct mm_struct *mm, unsigned long *table,
			unsigned long vmaddr)
{
	struct gmap *gmap;
	int flush;

	list_for_each_entry(gmap, &mm->context.gmap_list, list) {
		flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
		if (flush)
			gmap_flush_tlb(gmap);
	}
}

/**
 * __gmap_link - set up shadow page tables to connect a host to a guest address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 * @vmaddr: vm address
 *
 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 * if the vm address is already mapped to a different guest segment.
 * The mmap_sem of the mm that belongs to the address space must be held
 * when this function gets called.
 */
int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
{
	struct mm_struct *mm;
	unsigned long *table;
	spinlock_t *ptl;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	int rc;

	/* Create higher level tables in the gmap page table */
	table = gmap->table;
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
		table += (gaddr >> 53) & 0x7ff;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
				     gaddr & 0xffe0000000000000UL))
			return -ENOMEM;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	}
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
		table += (gaddr >> 42) & 0x7ff;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
				     gaddr & 0xfffffc0000000000UL))
			return -ENOMEM;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	}
	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
		table += (gaddr >> 31) & 0x7ff;
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
				     gaddr & 0xffffffff80000000UL))
			return -ENOMEM;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	}
	table += (gaddr >> 20) & 0x7ff;
	/* Walk the parent mm page table */
	mm = gmap->mm;
	pgd = pgd_offset(mm, vmaddr);
	VM_BUG_ON(pgd_none(*pgd));
	pud = pud_offset(pgd, vmaddr);
	VM_BUG_ON(pud_none(*pud));
	pmd = pmd_offset(pud, vmaddr);
	VM_BUG_ON(pmd_none(*pmd));
	/* large pmds cannot yet be handled */
	if (pmd_large(*pmd))
		return -EFAULT;
	/* Link gmap segment table entry location to page table. */
	rc = radix_tree_preload(GFP_KERNEL);
	if (rc)
		return rc;
	ptl = pmd_lock(mm, pmd);
	spin_lock(&gmap->guest_table_lock);
	if (*table == _SEGMENT_ENTRY_INVALID) {
		rc = radix_tree_insert(&gmap->host_to_guest,
				       vmaddr >> PMD_SHIFT, table);
		if (!rc)
			*table = pmd_val(*pmd);
	} else
		rc = 0;
	spin_unlock(&gmap->guest_table_lock);
	spin_unlock(ptl);
	radix_tree_preload_end();
	return rc;
}

/**
 * gmap_fault - resolve a fault on a guest address
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: guest address
 * @fault_flags: flags to pass down to handle_mm_fault()
 *
 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 * if the vm address is already mapped to a different guest segment.
 */
int gmap_fault(struct gmap *gmap, unsigned long gaddr,
	       unsigned int fault_flags)
{
	unsigned long vmaddr;
	int rc;

	down_read(&gmap->mm->mmap_sem);
	vmaddr = __gmap_translate(gmap, gaddr);
	if (IS_ERR_VALUE(vmaddr)) {
		rc = vmaddr;
		goto out_up;
	}
	if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags)) {
		rc = -EFAULT;
		goto out_up;
	}
	rc = __gmap_link(gmap, gaddr, vmaddr);
out_up:
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_fault);

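/*
 * Typical call sequence for the gmap interface above (illustrative
 * sketch only, not taken from this file; KVM is the in-tree user, and
 * "limit", "userspace_addr", "guest_addr" and "size" are placeholders):
 *
 *	struct gmap *gmap;
 *
 *	gmap = gmap_alloc(current->mm, limit);
 *	if (!gmap)
 *		return -ENOMEM;
 *	gmap_map_segment(gmap, userspace_addr, guest_addr, size);
 *	gmap_enable(gmap);	// before entering SIE
 *	...
 *	gmap_fault(gmap, guest_addr, FAULT_FLAG_WRITE);	// on a guest fault
 *	...
 *	gmap_disable(gmap);
 *	gmap_free(gmap);
 */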
static void gmap_zap_swap_entry(swp_entry_t entry, struct mm_struct *mm)
{
	if (!non_swap_entry(entry))
		dec_mm_counter(mm, MM_SWAPENTS);
	else if (is_migration_entry(entry)) {
		struct page *page = migration_entry_to_page(entry);

		if (PageAnon(page))
			dec_mm_counter(mm, MM_ANONPAGES);
		else
			dec_mm_counter(mm, MM_FILEPAGES);
	}
	free_swap_and_cache(entry);
}

/*
 * this function is assumed to be called with mmap_sem held
 */
void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
{
	unsigned long vmaddr, ptev, pgstev;
	pte_t *ptep, pte;
	spinlock_t *ptl;
	pgste_t pgste;

	/* Find the vm address for the guest address */
	vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
						   gaddr >> PMD_SHIFT);
	if (!vmaddr)
		return;
	vmaddr |= gaddr & ~PMD_MASK;
	/* Get pointer to the page table entry */
	ptep = get_locked_pte(gmap->mm, vmaddr, &ptl);
	if (unlikely(!ptep))
		return;
	pte = *ptep;
	if (!pte_swap(pte))
		goto out_pte;
	/* Zap unused and logically-zero pages */
	pgste = pgste_get_lock(ptep);
	pgstev = pgste_val(pgste);
	ptev = pte_val(pte);
	if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) ||
	    ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID))) {
		gmap_zap_swap_entry(pte_to_swp_entry(pte), gmap->mm);
		pte_clear(gmap->mm, vmaddr, ptep);
	}
	pgste_set_unlock(ptep, pgste);
out_pte:
	pte_unmap_unlock(ptep, ptl);
}
EXPORT_SYMBOL_GPL(__gmap_zap);

void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
{
	unsigned long gaddr, vmaddr, size;
	struct vm_area_struct *vma;

	down_read(&gmap->mm->mmap_sem);
	for (gaddr = from; gaddr < to;
	     gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
		/* Find the vm address for the guest address */
		vmaddr = (unsigned long)
			radix_tree_lookup(&gmap->guest_to_host,
					  gaddr >> PMD_SHIFT);
		if (!vmaddr)
			continue;
		vmaddr |= gaddr & ~PMD_MASK;
		/* Find vma in the parent mm */
		vma = find_vma(gmap->mm, vmaddr);
		size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
		zap_page_range(vma, vmaddr, size, NULL);
	}
	up_read(&gmap->mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(gmap_discard);

static LIST_HEAD(gmap_notifier_list);
static DEFINE_SPINLOCK(gmap_notifier_lock);

/**
 * gmap_register_ipte_notifier - register a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_register_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_add(&nb->list, &gmap_notifier_list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);

/**
 * gmap_unregister_ipte_notifier - remove a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_del_init(&nb->list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);

/**
 * gmap_ipte_notify - mark a range of ptes for invalidation notification
 * @gmap: pointer to guest mapping meta data structure
 * @gaddr: virtual address in the guest address space
 * @len: size of area
 *
 * Returns 0 if for each page in the given range a gmap mapping exists and
 * the invalidation notification could be set. If the gmap mapping is missing
 * for one or more pages -EFAULT is returned. If no memory could be allocated
 * -ENOMEM is returned. This function establishes missing page table entries.
 */
int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
{
	unsigned long addr;
	spinlock_t *ptl;
	pte_t *ptep, entry;
	pgste_t pgste;
	int rc = 0;

	if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
		return -EINVAL;
	down_read(&gmap->mm->mmap_sem);
	while (len) {
		/* Convert gmap address and connect the page tables */
		addr = __gmap_translate(gmap, gaddr);
		if (IS_ERR_VALUE(addr)) {
			rc = addr;
			break;
		}
		/* Get the page mapped */
		if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) {
			rc = -EFAULT;
			break;
		}
		rc = __gmap_link(gmap, gaddr, addr);
		if (rc)
			break;
		/* Walk the process page table, lock and get pte pointer */
		ptep = get_locked_pte(gmap->mm, addr, &ptl);
		VM_BUG_ON(!ptep);
		/* Set notification bit in the pgste of the pte */
		entry = *ptep;
		if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
			pgste = pgste_get_lock(ptep);
			pgste_val(pgste) |= PGSTE_IN_BIT;
			pgste_set_unlock(ptep, pgste);
			gaddr += PAGE_SIZE;
			len -= PAGE_SIZE;
		}
		pte_unmap_unlock(ptep, ptl);
	}
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_ipte_notify);

/**
 * gmap_do_ipte_notify - call all invalidation callbacks for a specific pte.
 * @mm: pointer to the process mm_struct
 * @vmaddr: virtual address in the process address space
 * @pte: pointer to the page table entry
 *
 * This function is assumed to be called with the page table lock held
 * for the pte to notify.
 */
void gmap_do_ipte_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte)
{
	unsigned long offset, gaddr;
	unsigned long *table;
	struct gmap_notifier *nb;
	struct gmap *gmap;

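	/*
	 * Translate the pte pointer into the guest address offset covered
	 * by that pte: the byte offset within the 2K page table selects
	 * the pte index, and each pte maps PAGE_SIZE bytes.
	 */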
	offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
	offset = offset * (4096 / sizeof(pte_t));
	spin_lock(&gmap_notifier_lock);
	list_for_each_entry(gmap, &mm->context.gmap_list, list) {
		table = radix_tree_lookup(&gmap->host_to_guest,
					  vmaddr >> PMD_SHIFT);
		if (!table)
			continue;
		gaddr = __gmap_segment_gaddr(table) + offset;
		list_for_each_entry(nb, &gmap_notifier_list, list)
			nb->notifier_call(gmap, gaddr);
	}
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_do_ipte_notify);

static inline int page_table_with_pgste(struct page *page)
{
	return atomic_read(&page->_mapcount) == 0;
}

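/*
 * Page tables with pgstes use a full 4K page: the lower half holds the
 * 256 ptes, the upper half holds the corresponding page status table
 * entries (pgstes). page->_mapcount == 0 marks such pages.
 */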
static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm)
{
	struct page *page;
	unsigned long *table;

	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
	if (!page)
		return NULL;
	if (!pgtable_page_ctor(page)) {
		__free_page(page);
		return NULL;
	}
	atomic_set(&page->_mapcount, 0);
	table = (unsigned long *) page_to_phys(page);
	clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
	clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
	return table;
}

static inline void page_table_free_pgste(unsigned long *table)
{
	struct page *page;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	pgtable_page_dtor(page);
	atomic_set(&page->_mapcount, -1);
	__free_page(page);
}

int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned long key, bool nq)
{
	spinlock_t *ptl;
	pgste_t old, new;
	pte_t *ptep;

	down_read(&mm->mmap_sem);
retry:
	ptep = get_locked_pte(mm, addr, &ptl);
	if (unlikely(!ptep)) {
		up_read(&mm->mmap_sem);
		return -EFAULT;
	}
	if (!(pte_val(*ptep) & _PAGE_INVALID) &&
	     (pte_val(*ptep) & _PAGE_PROTECT)) {
		pte_unmap_unlock(ptep, ptl);
		if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE)) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		goto retry;
	}

	new = old = pgste_get_lock(ptep);
	pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
			    PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
	pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		unsigned long address, bits, skey;

		address = pte_val(*ptep) & PAGE_MASK;
		skey = (unsigned long) page_get_storage_key(address);
		bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
		skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
		/* Set storage key ACC and FP */
		page_set_storage_key(address, skey, !nq);
		/* Merge host changed & referenced into pgste */
		pgste_val(new) |= bits << 52;
	}
	/* changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) &
	    (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
		pgste_val(new) |= PGSTE_UC_BIT;

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	up_read(&mm->mmap_sem);
	return 0;
}
EXPORT_SYMBOL(set_guest_storage_key);

unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
{
	spinlock_t *ptl;
	pgste_t pgste;
	pte_t *ptep;
	uint64_t physaddr;
	unsigned long key = 0;

	down_read(&mm->mmap_sem);
	ptep = get_locked_pte(mm, addr, &ptl);
	if (unlikely(!ptep)) {
		up_read(&mm->mmap_sem);
		return -EFAULT;
	}
	pgste = pgste_get_lock(ptep);

	if (pte_val(*ptep) & _PAGE_INVALID) {
		key |= (pgste_val(pgste) & PGSTE_ACC_BITS) >> 56;
		key |= (pgste_val(pgste) & PGSTE_FP_BIT) >> 56;
		key |= (pgste_val(pgste) & PGSTE_GR_BIT) >> 48;
		key |= (pgste_val(pgste) & PGSTE_GC_BIT) >> 48;
	} else {
		physaddr = pte_val(*ptep) & PAGE_MASK;
		key = page_get_storage_key(physaddr);

		/* Reflect guest's logical view, not physical */
		if (pgste_val(pgste) & PGSTE_GR_BIT)
			key |= _PAGE_REFERENCED;
		if (pgste_val(pgste) & PGSTE_GC_BIT)
			key |= _PAGE_CHANGED;
	}

	pgste_set_unlock(ptep, pgste);
	pte_unmap_unlock(ptep, ptl);
	up_read(&mm->mmap_sem);
	return key;
}
EXPORT_SYMBOL(get_guest_storage_key);

#else /* CONFIG_PGSTE */

static inline int page_table_with_pgste(struct page *page)
{
	return 0;
}

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm)
{
	return NULL;
}

static inline void page_table_free_pgste(unsigned long *table)
{
}

static inline void gmap_unlink(struct mm_struct *mm, unsigned long *table,
			       unsigned long vmaddr)
{
}

#endif /* CONFIG_PGSTE */

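/*
 * Atomically toggle the given fragment bits in page->_mapcount and
 * return the new value; used to track which 1K/2K page table fragments
 * of a 4K page are in use.
 */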
static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}

/*
 * page table entry allocation/free routines.
 */
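/*
 * Without pgstes a 4K page is split into 1K/2K fragments and handed out
 * piecewise; partially used pages stay on mm->context.pgtable_list. The
 * low nibble of page->_mapcount records which fragments are allocated,
 * the next nibble (set in page_table_free_rcu) marks fragments that are
 * pending RCU removal and must not be reallocated yet.
 */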
unsigned long *page_table_alloc(struct mm_struct *mm)
{
	unsigned long *uninitialized_var(table);
	struct page *uninitialized_var(page);
	unsigned int mask, bit;

	if (mm_has_pgste(mm))
		return page_table_alloc_pgste(mm);
	/* Allocate fragments of a 4K page as 1K/2K page table */
	spin_lock_bh(&mm->context.list_lock);
	mask = FRAG_MASK;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		table = (unsigned long *) page_to_phys(page);
		mask = atomic_read(&page->_mapcount);
		mask = mask | (mask >> 4);
	}
	if ((mask & FRAG_MASK) == FRAG_MASK) {
		spin_unlock_bh(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		if (!pgtable_page_ctor(page)) {
			__free_page(page);
			return NULL;
		}
		atomic_set(&page->_mapcount, 1);
		table = (unsigned long *) page_to_phys(page);
		clear_table(table, _PAGE_INVALID, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	} else {
		for (bit = 1; mask & bit; bit <<= 1)
			table += PTRS_PER_PTE;
		mask = atomic_xor_bits(&page->_mapcount, bit);
		if ((mask & FRAG_MASK) == FRAG_MASK)
			list_del(&page->lru);
	}
	spin_unlock_bh(&mm->context.list_lock);
	return table;
}

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned int bit, mask;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (page_table_with_pgste(page))
		return page_table_free_pgste(table);
	/* Free 1K/2K page table fragment of a 4K page */
	bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit);
	if (mask & FRAG_MASK)
		list_add(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	if (mask == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

static void __page_table_free_rcu(void *table, unsigned bit)
{
	struct page *page;

	if (bit == FRAG_MASK)
		return page_table_free_pgste(table);
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
			 unsigned long vmaddr)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (page_table_with_pgste(page)) {
		gmap_unlink(mm, table, vmaddr);
		table = (unsigned long *) (__pa(table) | FRAG_MASK);
		tlb_remove_table(tlb, table);
		return;
	}
	bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
	if (mask & FRAG_MASK)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *) (__pa(table) | (bit << 4));
	tlb_remove_table(tlb, table);
}

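/*
 * The low bits of the table address passed to tlb_remove_table() encode
 * what is being freed: 0 for a crst table, the shifted fragment bit for
 * a 1K/2K page table fragment, and FRAG_MASK for a full pgste page.
 */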
static void __tlb_remove_table(void *_table)
{
	const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK;
	void *table = (void *)((unsigned long) _table & ~mask);
	unsigned type = (unsigned long) _table & mask;

	if (type)
		__page_table_free_rcu(table, type);
	else
		free_pages((unsigned long) table, ALLOC_ORDER);
}

static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely
	 * on IRQ disabling. See the comment near struct mmu_table_batch.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	tlb->mm->context.flush_mm = 1;
	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)
			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			__tlb_flush_mm_lazy(tlb->mm);
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_flush_mmu(tlb);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline void thp_split_vma(struct vm_area_struct *vma)
{
	unsigned long addr;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE)
		follow_page(vma, addr, FOLL_SPLIT);
}

static inline void thp_split_mm(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
		thp_split_vma(vma);
		vma->vm_flags &= ~VM_HUGEPAGE;
		vma->vm_flags |= VM_NOHUGEPAGE;
	}
	mm->def_flags |= VM_NOHUGEPAGE;
}
#else
static inline void thp_split_mm(struct mm_struct *mm)
{
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static unsigned long page_table_realloc_pmd(struct mmu_gather *tlb,
				struct mm_struct *mm, pud_t *pud,
				unsigned long addr, unsigned long end)
{
	unsigned long next, *table, *new;
	struct page *page;
	spinlock_t *ptl;
	pmd_t *pmd;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
again:
		if (pmd_none_or_clear_bad(pmd))
			continue;
		table = (unsigned long *) pmd_deref(*pmd);
		page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
		if (page_table_with_pgste(page))
			continue;
		/* Allocate new page table with pgstes */
		new = page_table_alloc_pgste(mm);
		if (!new)
			return -ENOMEM;

		ptl = pmd_lock(mm, pmd);
		if (likely((unsigned long *) pmd_deref(*pmd) == table)) {
			/* Nuke pmd entry pointing to the "short" page table */
			pmdp_flush_lazy(mm, addr, pmd);
			pmd_clear(pmd);

			/* Copy ptes from old table to new table */
			memcpy(new, table, PAGE_SIZE/2);
			clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
			/* Establish new table */
			pmd_populate(mm, pmd, (pte_t *) new);
			/* Free old table with rcu, there might be a walker! */
			page_table_free_rcu(tlb, table, addr);
			new = NULL;
		}
		spin_unlock(ptl);
		if (new) {
			page_table_free_pgste(new);
			goto again;
		}
	} while (pmd++, addr = next, addr != end);

	return addr;
}

static unsigned long page_table_realloc_pud(struct mmu_gather *tlb,
				struct mm_struct *mm, pgd_t *pgd,
				unsigned long addr, unsigned long end)
{
	unsigned long next;
	pud_t *pud;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		next = page_table_realloc_pmd(tlb, mm, pud, addr, next);
		if (unlikely(IS_ERR_VALUE(next)))
			return next;
	} while (pud++, addr = next, addr != end);

	return addr;
}

static unsigned long page_table_realloc(struct mmu_gather *tlb, struct mm_struct *mm,
					unsigned long addr, unsigned long end)
{
	unsigned long next;
	pgd_t *pgd;

	pgd = pgd_offset(mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		next = page_table_realloc_pud(tlb, mm, pgd, addr, next);
		if (unlikely(IS_ERR_VALUE(next)))
			return next;
	} while (pgd++, addr = next, addr != end);

	return 0;
}

/*
 * switch on pgstes for its userspace process (for kvm)
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm = tsk->mm;
	struct mmu_gather tlb;

	/* Do we have pgstes? if yes, we are done */
	if (mm_has_pgste(tsk->mm))
		return 0;
	down_write(&mm->mmap_sem);
	/* split thp mappings and disable thp for future mappings */
	thp_split_mm(mm);
	/* Reallocate the page tables with pgstes */
	tlb_gather_mmu(&tlb, mm, 0, TASK_SIZE);
	if (!page_table_realloc(&tlb, mm, 0, TASK_SIZE))
		mm->context.has_pgste = 1;
	tlb_finish_mmu(&tlb, 0, TASK_SIZE);
	up_write(&mm->mmap_sem);
	return mm->context.has_pgste ? 0 : -ENOMEM;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);

/*
 * Enable storage key handling from now on and initialize the storage
 * keys with the default key.
 */
static int __s390_enable_skey(pte_t *pte, unsigned long addr,
			      unsigned long next, struct mm_walk *walk)
{
	unsigned long ptev;
	pgste_t pgste;

	pgste = pgste_get_lock(pte);
	/*
	 * Remove all zero page mappings. After establishing a policy that
	 * forbids zero page mappings, subsequent faults on these pages
	 * will get fresh anonymous pages.
	 */
	if (is_zero_pfn(pte_pfn(*pte))) {
		ptep_flush_direct(walk->mm, addr, pte);
		pte_val(*pte) = _PAGE_INVALID;
	}
	/* Clear storage key */
	pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
			      PGSTE_GR_BIT | PGSTE_GC_BIT);
	ptev = pte_val(*pte);
	if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
		page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1);
	pgste_set_unlock(pte, pgste);
	return 0;
}

int s390_enable_skey(void)
{
	struct mm_walk walk = { .pte_entry = __s390_enable_skey };
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	int rc = 0;

	down_write(&mm->mmap_sem);
	if (mm_use_skey(mm))
		goto out_up;

	mm->context.use_skey = 1;
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (ksm_madvise(vma, vma->vm_start, vma->vm_end,
				MADV_UNMERGEABLE, &vma->vm_flags)) {
			mm->context.use_skey = 0;
			rc = -ENOMEM;
			goto out_up;
		}
	}
	mm->def_flags &= ~VM_MERGEABLE;

	walk.mm = mm;
	walk_page_range(0, TASK_SIZE, &walk);

out_up:
	up_write(&mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(s390_enable_skey);

/*
 * Reset CMMA state, make all pages stable again.
 */
static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	pgste_t pgste;

	pgste = pgste_get_lock(pte);
	pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
	pgste_set_unlock(pte, pgste);
	return 0;
}

void s390_reset_cmma(struct mm_struct *mm)
{
	struct mm_walk walk = { .pte_entry = __s390_reset_cmma };

	down_write(&mm->mmap_sem);
	walk.mm = mm;
	walk_page_range(0, TASK_SIZE, &walk);
	up_write(&mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(s390_reset_cmma);

/*
 * Test and reset if a guest page is dirty
 */
bool gmap_test_and_clear_dirty(unsigned long address, struct gmap *gmap)
{
	pte_t *pte;
	spinlock_t *ptl;
	bool dirty = false;

	pte = get_locked_pte(gmap->mm, address, &ptl);
	if (unlikely(!pte))
		return false;

	if (ptep_test_and_clear_user_dirty(gmap->mm, address, pte))
		dirty = true;

	spin_unlock(ptl);
	return dirty;
}
EXPORT_SYMBOL_GPL(gmap_test_and_clear_dirty);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address,
			   pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	/* No need to flush TLB
	 * On s390 reference bits are in storage key and never in TLB */
	return pmdp_test_and_clear_young(vma, address, pmdp);
}

int pmdp_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmdp,
			  pmd_t entry, int dirty)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	entry = pmd_mkyoung(entry);
	if (dirty)
		entry = pmd_mkdirty(entry);
	if (pmd_same(*pmdp, entry))
		return 0;
	pmdp_invalidate(vma, address, pmdp);
	set_pmd_at(vma->vm_mm, address, pmdp, entry);
	return 1;
}

static void pmdp_splitting_flush_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
			      (unsigned long *) pmdp)) {
		/* need to serialize against gup-fast (IRQ disabled) */
		smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
	}
}

void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	struct list_head *lh;
	pgtable_t pgtable;
	pte_t *ptep;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	pte_val(*ptep) = _PAGE_INVALID;
	ptep++;
	pte_val(*ptep) = _PAGE_INVALID;
	return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */