fremap.c

/*
 * linux/mm/fremap.c
 *
 * Explicit pagetable population and nonlinear (random) mappings support.
 *
 * started by Ingo Molnar, Copyright (C) 2002, 2003
 */
#include <linux/export.h>
#include <linux/backing-dev.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/file.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/rmap.h>
#include <linux/syscalls.h>
#include <linux/mmu_notifier.h>

#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

#include "internal.h"
static int mm_counter(struct page *page)
{
        return PageAnon(page) ? MM_ANONPAGES : MM_FILEPAGES;
}

static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
                    unsigned long addr, pte_t *ptep)
{
        pte_t pte = *ptep;
        struct page *page;
        swp_entry_t entry;

        if (pte_present(pte)) {
                flush_cache_page(vma, addr, pte_pfn(pte));
                pte = ptep_clear_flush(vma, addr, ptep);
                page = vm_normal_page(vma, addr, pte);
                if (page) {
                        if (pte_dirty(pte))
                                set_page_dirty(page);
                        update_hiwater_rss(mm);
                        dec_mm_counter(mm, mm_counter(page));
                        page_remove_rmap(page);
                        page_cache_release(page);
                }
        } else {        /* zap_pte() is not called when pte_none() */
                if (!pte_file(pte)) {
                        update_hiwater_rss(mm);
                        entry = pte_to_swp_entry(pte);
                        if (non_swap_entry(entry)) {
                                if (is_migration_entry(entry)) {
                                        page = migration_entry_to_page(entry);
                                        dec_mm_counter(mm, mm_counter(page));
                                }
                        } else {
                                free_swap_and_cache(entry);
                                dec_mm_counter(mm, MM_SWAPENTS);
                        }
                }
                pte_clear_not_present_full(mm, addr, ptep, 0);
        }
}
/*
 * Install a file pte to a given virtual memory address, release any
 * previously existing mapping.
 */
static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
                            unsigned long addr, unsigned long pgoff, pgprot_t prot)
{
        int err = -ENOMEM;
        pte_t *pte, ptfile;
        spinlock_t *ptl;

        pte = get_locked_pte(mm, addr, &ptl);
        if (!pte)
                goto out;

        ptfile = pgoff_to_pte(pgoff);

        if (!pte_none(*pte))
                zap_pte(mm, vma, addr, pte);

        set_pte_at(mm, addr, pte, pte_file_mksoft_dirty(ptfile));
        /*
         * We don't need to run update_mmu_cache() here because the "file pte"
         * being installed by install_file_pte() is not a real pte - it's a
         * non-present entry (like a swap entry), noting what file offset should
         * be mapped there when there's a fault (in a non-linear vma where
         * that's not obvious).
         */
        pte_unmap_unlock(pte, ptl);
        err = 0;
out:
        return err;
}
int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
                             unsigned long size, pgoff_t pgoff)
{
        struct mm_struct *mm = vma->vm_mm;
        int err;

        do {
                err = install_file_pte(mm, vma, addr, pgoff, vma->vm_page_prot);
                if (err)
                        return err;

                size -= PAGE_SIZE;
                addr += PAGE_SIZE;
                pgoff++;
        } while (size);

        return 0;
}
EXPORT_SYMBOL(generic_file_remap_pages);
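
/*
 * Illustrative sketch, not part of the original file: this exported helper
 * is meant to be plugged into a mapping's vm_operations_struct, which is
 * exactly what sys_remap_file_pages() checks for via vma->vm_ops->remap_pages
 * below. In-tree users of this era (e.g. generic_file_vm_ops in mm/filemap.c)
 * wire it up roughly as follows; the "example_*" names are hypothetical.
 *
 *      static const struct vm_operations_struct example_file_vm_ops = {
 *              .fault          = filemap_fault,
 *              .remap_pages    = generic_file_remap_pages,
 *      };
 *
 *      static int example_file_mmap(struct file *file, struct vm_area_struct *vma)
 *      {
 *              vma->vm_ops = &example_file_vm_ops;
 *              return 0;
 *      }
 */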
/**
 * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma
 * @start: start of the remapped virtual memory range
 * @size: size of the remapped virtual memory range
 * @prot: new protection bits of the range (see NOTE)
 * @pgoff: to-be-mapped page of the backing store file
 * @flags: 0 or MAP_NONBLOCK - the latter will cause no IO.
 *
 * sys_remap_file_pages remaps arbitrary pages of an existing VM_SHARED vma
 * (shared backing store file).
 *
 * This syscall works purely via pagetables, so it's the most efficient
 * way to map the same (large) file into a given virtual window. Unlike
 * mmap()/mremap() it does not create any new vmas. The new mappings are
 * also safe across swapout.
 *
 * NOTE: the @prot parameter right now is ignored (but must be zero),
 * and the vma's default protection is used. Arbitrary protections
 * might be implemented in the future.
 */
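/*
 * Illustrative userspace sketch, not part of the original file: a typical
 * caller first creates a shared file mapping with mmap() and then rearranges
 * pages inside that window with the glibc remap_file_pages() wrapper
 * (int remap_file_pages(void *addr, size_t size, int prot, size_t pgoff,
 * int flags), needs _GNU_SOURCE). The path and sizes below are made up.
 *
 *      #define _GNU_SOURCE
 *      #include <sys/mman.h>
 *      #include <fcntl.h>
 *
 *      int fd = open("/tmp/data", O_RDWR);
 *      size_t len = 4 * 4096;
 *      char *win = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 *      (make the first page of the window show file page 3 instead of page 0;
 *       prot must be 0, pgoff is in units of pages)
 *      remap_file_pages(win, 4096, 0, 3, 0);
 *
 * Passing MAP_NONBLOCK in flags skips populating the remapped pages with IO.
 */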
SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
                unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
{
        struct mm_struct *mm = current->mm;
        struct address_space *mapping;
        struct vm_area_struct *vma;
        int err = -EINVAL;
        int has_write_lock = 0;
        vm_flags_t vm_flags = 0;

        pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. "
                     "See Documentation/vm/remap_file_pages.txt.\n",
                     current->comm, current->pid);

        if (prot)
                return err;
        /*
         * Sanitize the syscall parameters:
         */
        start = start & PAGE_MASK;
        size = size & PAGE_MASK;

        /* Does the address range wrap, or is the span zero-sized? */
        if (start + size <= start)
                return err;

        /* Does pgoff wrap? */
        if (pgoff + (size >> PAGE_SHIFT) < pgoff)
                return err;

        /* Can we represent this offset inside this architecture's pte's? */
#if PTE_FILE_MAX_BITS < BITS_PER_LONG
        if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS))
                return err;
#endif

        /* We need down_write() to change vma->vm_flags. */
        down_read(&mm->mmap_sem);
retry:
        vma = find_vma(mm, start);

        /*
         * Make sure the vma is shared, that it supports prefaulting,
         * and that the remapped range is valid and fully within
         * the single existing vma.
         */
        if (!vma || !(vma->vm_flags & VM_SHARED))
                goto out;

        if (!vma->vm_ops || !vma->vm_ops->remap_pages)
                goto out;

        if (start < vma->vm_start || start + size > vma->vm_end)
                goto out;

        /* Must set VM_NONLINEAR before any pages are populated. */
        if (!(vma->vm_flags & VM_NONLINEAR)) {
                /*
                 * vm_private_data is used as a swapout cursor
                 * in a VM_NONLINEAR vma.
                 */
                if (vma->vm_private_data)
                        goto out;

                /* Don't need a nonlinear mapping, exit success */
                if (pgoff == linear_page_index(vma, start)) {
                        err = 0;
                        goto out;
                }

                if (!has_write_lock) {
get_write_lock:
                        up_read(&mm->mmap_sem);
                        down_write(&mm->mmap_sem);
                        has_write_lock = 1;
                        goto retry;
                }
                mapping = vma->vm_file->f_mapping;
                /*
                 * page_mkclean doesn't work on nonlinear vmas, so if
                 * dirty pages need to be accounted, emulate with linear
                 * vmas.
                 */
                if (mapping_cap_account_dirty(mapping)) {
                        unsigned long addr;
                        struct file *file = get_file(vma->vm_file);
                        /* mmap_region may free vma; grab the info now */
                        vm_flags = vma->vm_flags;

                        addr = mmap_region(file, start, size, vm_flags, pgoff);
                        fput(file);
                        if (IS_ERR_VALUE(addr)) {
                                err = addr;
                        } else {
                                BUG_ON(addr != start);
                                err = 0;
                        }
                        goto out_freed;
                }
                mutex_lock(&mapping->i_mmap_mutex);
                flush_dcache_mmap_lock(mapping);
                vma->vm_flags |= VM_NONLINEAR;
                vma_interval_tree_remove(vma, &mapping->i_mmap);
                vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
                flush_dcache_mmap_unlock(mapping);
                mutex_unlock(&mapping->i_mmap_mutex);
        }

        if (vma->vm_flags & VM_LOCKED) {
                /*
                 * drop PG_Mlocked flag for over-mapped range
                 */
                if (!has_write_lock)
                        goto get_write_lock;
                vm_flags = vma->vm_flags;
                munlock_vma_pages_range(vma, start, start + size);
                vma->vm_flags = vm_flags;
        }

        mmu_notifier_invalidate_range_start(mm, start, start + size);
        err = vma->vm_ops->remap_pages(vma, start, size, pgoff);
        mmu_notifier_invalidate_range_end(mm, start, start + size);

        /*
         * We can't clear VM_NONLINEAR because we'd have to do
         * it after ->populate completes, and that would prevent
         * downgrading the lock. (Locks can't be upgraded).
         */
out:
        if (vma)
                vm_flags = vma->vm_flags;
out_freed:
        if (likely(!has_write_lock))
                up_read(&mm->mmap_sem);
        else
                up_write(&mm->mmap_sem);
        if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK)))
                mm_populate(start, size);

        return err;
}