
Merge branch 'akpm' (patches from Andrew Morton)

Merge more patches from Andrew Morton:
 "The rest of MM.  Plus one misc cleanup"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (35 commits)
  mm/Kconfig: add MMU dependency for MIGRATION.
  kernel: replace strict_strto*() with kstrto*()
  mm, thp: count thp_fault_fallback anytime thp fault fails
  thp: consolidate code between handle_mm_fault() and do_huge_pmd_anonymous_page()
  thp: do_huge_pmd_anonymous_page() cleanup
  thp: move maybe_pmd_mkwrite() out of mk_huge_pmd()
  mm: cleanup add_to_page_cache_locked()
  thp: account anon transparent huge pages into NR_ANON_PAGES
  truncate: drop 'oldsize' truncate_pagecache() parameter
  mm: make lru_add_drain_all() selective
  memcg: document cgroup dirty/writeback memory statistics
  memcg: add per cgroup writeback pages accounting
  memcg: check for proper lock held in mem_cgroup_update_page_stat
  memcg: remove MEMCG_NR_FILE_MAPPED
  memcg: reduce function dereference
  memcg: avoid overflow caused by PAGE_ALIGN
  memcg: rename RESOURCE_MAX to RES_COUNTER_MAX
  memcg: correct RESOURCE_MAX to ULLONG_MAX
  mm: memcg: do not trap chargers with full callstack on OOM
  mm: memcg: rework and document OOM waiting and wakeup
  ...
Linus Torvalds, 12 years ago
commit ac4de9543a
79 changed files with 973 additions and 919 deletions
  1. Documentation/cgroups/memory.txt (+2, -0)
  2. arch/alpha/mm/fault.c (+4, -3)
  3. arch/arc/mm/fault.c (+4, -7)
  4. arch/arm/mm/fault.c (+13, -10)
  5. arch/arm64/mm/fault.c (+17, -14)
  6. arch/avr32/mm/fault.c (+3, -1)
  7. arch/cris/mm/fault.c (+4, -2)
  8. arch/frv/mm/fault.c (+6, -4)
  9. arch/hexagon/mm/vm_fault.c (+4, -2)
  10. arch/ia64/mm/fault.c (+4, -2)
  11. arch/m32r/mm/fault.c (+6, -4)
  12. arch/m68k/mm/fault.c (+2, -0)
  13. arch/metag/mm/fault.c (+4, -2)
  14. arch/microblaze/mm/fault.c (+5, -2)
  15. arch/mips/mm/fault.c (+6, -2)
  16. arch/mn10300/mm/fault.c (+2, -0)
  17. arch/openrisc/mm/fault.c (+1, -0)
  18. arch/parisc/mm/fault.c (+5, -2)
  19. arch/powerpc/mm/fault.c (+4, -3)
  20. arch/s390/mm/fault.c (+2, -0)
  21. arch/score/mm/fault.c (+6, -7)
  22. arch/sh/mm/fault.c (+6, -3)
  23. arch/sparc/mm/fault_32.c (+9, -3)
  24. arch/sparc/mm/fault_64.c (+4, -2)
  25. arch/tile/mm/fault.c (+5, -8)
  26. arch/um/kernel/trap.c (+14, -8)
  27. arch/unicore32/mm/fault.c (+13, -9)
  28. arch/x86/mm/fault.c (+22, -21)
  29. arch/xtensa/mm/fault.c (+2, -0)
  30. drivers/base/node.c (+0, -6)
  31. fs/adfs/inode.c (+1, -1)
  32. fs/affs/file.c (+1, -1)
  33. fs/bfs/file.c (+1, -1)
  34. fs/btrfs/free-space-cache.c (+1, -3)
  35. fs/btrfs/inode.c (+1, -1)
  36. fs/cifs/inode.c (+1, -4)
  37. fs/exofs/inode.c (+1, -1)
  38. fs/ext2/inode.c (+1, -1)
  39. fs/ext4/inode.c (+1, -2)
  40. fs/fat/inode.c (+1, -1)
  41. fs/fuse/dir.c (+1, -1)
  42. fs/fuse/inode.c (+1, -1)
  43. fs/gfs2/bmap.c (+2, -2)
  44. fs/hfs/inode.c (+1, -1)
  45. fs/hfsplus/inode.c (+1, -1)
  46. fs/hpfs/file.c (+1, -1)
  47. fs/jfs/inode.c (+1, -1)
  48. fs/minix/inode.c (+1, -1)
  49. fs/nfs/inode.c (+1, -3)
  50. fs/nilfs2/inode.c (+1, -1)
  51. fs/ntfs/file.c (+1, -1)
  52. fs/omfs/file.c (+1, -1)
  53. fs/proc/meminfo.c (+0, -6)
  54. fs/sysv/itree.c (+1, -1)
  55. fs/udf/inode.c (+1, -1)
  56. fs/ufs/inode.c (+1, -1)
  57. fs/xfs/xfs_aops.c (+2, -2)
  58. include/linux/huge_mm.h (+0, -3)
  59. include/linux/memcontrol.h (+130, -18)
  60. include/linux/mm.h (+4, -2)
  61. include/linux/res_counter.h (+1, -1)
  62. include/linux/sched.h (+7, -0)
  63. include/linux/swap.h (+1, -1)
  64. kernel/gcov/fs.c (+1, -1)
  65. kernel/ksysfs.c (+1, -1)
  66. kernel/params.c (+7, -7)
  67. kernel/res_counter.c (+16, -9)
  68. mm/Kconfig (+2, -2)
  69. mm/filemap.c (+35, -24)
  70. mm/huge_memory.c (+56, -73)
  71. mm/memcontrol.c (+337, -534)
  72. mm/memory.c (+39, -13)
  73. mm/oom_kill.c (+5, -2)
  74. mm/page-writeback.c (+15, -0)
  75. mm/rmap.c (+11, -11)
  76. mm/swap.c (+39, -5)
  77. mm/truncate.c (+2, -7)
  78. mm/vmscan.c (+52, -31)
  79. net/ipv4/tcp_memcontrol.c (+5, -5)
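
Nearly all of the arch/*/mm/fault.c hunks below apply the same two-part change: the handler sets FAULT_FLAG_USER when the fault came from user mode, and it raises FAULT_FLAG_WRITE only after the VMA's access permissions have been checked, instead of deriving it up front from the hardware fault code. A minimal sketch of that shape (illustrative only; the function name and parameters are placeholders, not any particular architecture's handler):

static void example_do_page_fault(struct pt_regs *regs, unsigned long address,
				  int is_write)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
	int fault;

	/* Tell the core VM that userspace is on the hook for this fault. */
	if (user_mode(regs))
		flags |= FAULT_FLAG_USER;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, address);
	/* ... vma validity and address-range checks elided ... */

	if (is_write) {
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		/* Mark the fault as a write only once the access is allowed. */
		flags |= FAULT_FLAG_WRITE;
	}

	fault = handle_mm_fault(mm, vma, address, flags);
	/* ... VM_FAULT_RETRY and VM_FAULT_ERROR handling elided ... */
bad_area:
	up_read(&mm->mmap_sem);
}

Passing FAULT_FLAG_USER down is what the memcg OOM rework further down in this merge relies on to avoid trapping chargers with a full call stack.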

+ 2 - 0
Documentation/cgroups/memory.txt

@@ -490,6 +490,8 @@ pgpgin		- # of charging events to the memory cgroup. The charging
 pgpgout		- # of uncharging events to the memory cgroup. The uncharging
 		event happens each time a page is unaccounted from the cgroup.
 swap		- # of bytes of swap usage
+writeback	- # of bytes of file/anon cache that are queued for syncing to
+		disk.
 inactive_anon	- # of bytes of anonymous and swap cache memory on inactive
 		LRU list.
 active_anon	- # of bytes of anonymous and swap cache memory on active

+ 4 - 3
arch/alpha/mm/fault.c

@@ -89,8 +89,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
 	const struct exception_table_entry *fixup;
 	int fault, si_code = SEGV_MAPERR;
 	siginfo_t info;
-	unsigned int flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-			      (cause > 0 ? FAULT_FLAG_WRITE : 0));
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 	/* As of EV6, a load into $31/$f31 is a prefetch, and never faults
 	   (or is suppressed by the PALcode).  Support that for older CPUs
@@ -115,7 +114,8 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
 	if (address >= TASK_SIZE)
 		goto vmalloc_fault;
 #endif
-
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
 retry:
 	down_read(&mm->mmap_sem);
 	vma = find_vma(mm, address);
@@ -142,6 +142,7 @@ retry:
 	} else {
 		if (!(vma->vm_flags & VM_WRITE))
 			goto bad_area;
+		flags |= FAULT_FLAG_WRITE;
 	}
 
 	/* If for any reason at all we couldn't handle the fault,

+ 4 - 7
arch/arc/mm/fault.c

@@ -60,8 +60,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address)
 	siginfo_t info;
 	int fault, ret;
 	int write = regs->ecr_cause & ECR_C_PROTV_STORE;  /* ST/EX */
-	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-				(write ? FAULT_FLAG_WRITE : 0);
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 	/*
 	 * We fault-in kernel-space virtual memory on-demand. The
@@ -89,6 +88,8 @@ void do_page_fault(struct pt_regs *regs, unsigned long address)
 	if (in_atomic() || !mm)
 		goto no_context;
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
 retry:
 	down_read(&mm->mmap_sem);
 	vma = find_vma(mm, address);
@@ -117,12 +118,12 @@ good_area:
 	if (write) {
 		if (!(vma->vm_flags & VM_WRITE))
 			goto bad_area;
+		flags |= FAULT_FLAG_WRITE;
 	} else {
 		if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
 			goto bad_area;
 	}
 
-survive:
 	/*
 	 * If for any reason at all we couldn't handle the fault,
 	 * make sure we exit gracefully rather than endlessly redo
@@ -201,10 +202,6 @@ no_context:
 	die("Oops", regs, address);
 
 out_of_memory:
-	if (is_global_init(tsk)) {
-		yield();
-		goto survive;
-	}
 	up_read(&mm->mmap_sem);
 
 	if (user_mode(regs)) {

+ 13 - 10
arch/arm/mm/fault.c

@@ -261,9 +261,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 	struct task_struct *tsk;
 	struct mm_struct *mm;
 	int fault, sig, code;
-	int write = fsr & FSR_WRITE;
-	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-				(write ? FAULT_FLAG_WRITE : 0);
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 	if (notify_page_fault(regs, fsr))
 		return 0;
@@ -282,6 +280,11 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 	if (in_atomic() || !mm)
 		goto no_context;
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
+	if (fsr & FSR_WRITE)
+		flags |= FAULT_FLAG_WRITE;
+
 	/*
 	 * As per x86, we may deadlock here.  However, since the kernel only
 	 * validly references user space from well defined areas of the code,
@@ -349,6 +352,13 @@ retry:
 	if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS))))
 		return 0;
 
+	/*
+	 * If we are in kernel mode at this point, we
+	 * have no context to handle this fault with.
+	 */
+	if (!user_mode(regs))
+		goto no_context;
+
 	if (fault & VM_FAULT_OOM) {
 		/*
 		 * We ran out of memory, call the OOM killer, and return to
@@ -359,13 +369,6 @@ retry:
 		return 0;
 	}
 
-	/*
-	 * If we are in kernel mode at this point, we
-	 * have no context to handle this fault with.
-	 */
-	if (!user_mode(regs))
-		goto no_context;
-
 	if (fault & VM_FAULT_SIGBUS) {
 		/*
 		 * We had some memory, but were unable to

+ 17 - 14
arch/arm64/mm/fault.c

@@ -199,13 +199,6 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	unsigned long vm_flags = VM_READ | VM_WRITE | VM_EXEC;
 	unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
-	if (esr & ESR_LNX_EXEC) {
-		vm_flags = VM_EXEC;
-	} else if ((esr & ESR_WRITE) && !(esr & ESR_CM)) {
-		vm_flags = VM_WRITE;
-		mm_flags |= FAULT_FLAG_WRITE;
-	}
-
 	tsk = current;
 	mm  = tsk->mm;
 
@@ -220,6 +213,16 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	if (in_atomic() || !mm)
 		goto no_context;
 
+	if (user_mode(regs))
+		mm_flags |= FAULT_FLAG_USER;
+
+	if (esr & ESR_LNX_EXEC) {
+		vm_flags = VM_EXEC;
+	} else if ((esr & ESR_WRITE) && !(esr & ESR_CM)) {
+		vm_flags = VM_WRITE;
+		mm_flags |= FAULT_FLAG_WRITE;
+	}
+
 	/*
 	 * As per x86, we may deadlock here. However, since the kernel only
 	 * validly references user space from well defined areas of the code,
@@ -288,6 +291,13 @@ retry:
 			      VM_FAULT_BADACCESS))))
 		return 0;
 
+	/*
+	 * If we are in kernel mode at this point, we have no context to
+	 * handle this fault with.
+	 */
+	if (!user_mode(regs))
+		goto no_context;
+
 	if (fault & VM_FAULT_OOM) {
 		/*
 		 * We ran out of memory, call the OOM killer, and return to
@@ -298,13 +308,6 @@ retry:
 		return 0;
 	}
 
-	/*
-	 * If we are in kernel mode at this point, we have no context to
-	 * handle this fault with.
-	 */
-	if (!user_mode(regs))
-		goto no_context;
-
 	if (fault & VM_FAULT_SIGBUS) {
 		/*
 		 * We had some memory, but were unable to successfully fix up

+ 3 - 1
arch/avr32/mm/fault.c

@@ -86,6 +86,8 @@ asmlinkage void do_page_fault(unsigned long ecr, struct pt_regs *regs)
 
 	local_irq_enable();
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
 retry:
 	down_read(&mm->mmap_sem);
 
@@ -228,9 +230,9 @@ no_context:
 	 */
 out_of_memory:
 	up_read(&mm->mmap_sem);
-	pagefault_out_of_memory();
 	if (!user_mode(regs))
 		goto no_context;
+	pagefault_out_of_memory();
 	return;
 
 do_sigbus:

+ 4 - 2
arch/cris/mm/fault.c

@@ -58,8 +58,7 @@ do_page_fault(unsigned long address, struct pt_regs *regs,
 	struct vm_area_struct * vma;
 	siginfo_t info;
 	int fault;
-	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-				((writeaccess & 1) ? FAULT_FLAG_WRITE : 0);
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 	D(printk(KERN_DEBUG
 		 "Page fault for %lX on %X at %lX, prot %d write %d\n",
@@ -117,6 +116,8 @@ do_page_fault(unsigned long address, struct pt_regs *regs,
 	if (in_atomic() || !mm)
 		goto no_context;
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
 retry:
 	down_read(&mm->mmap_sem);
 	vma = find_vma(mm, address);
@@ -155,6 +156,7 @@ retry:
 	} else if (writeaccess == 1) {
 		if (!(vma->vm_flags & VM_WRITE))
 			goto bad_area;
+		flags |= FAULT_FLAG_WRITE;
 	} else {
 		if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
 			goto bad_area;

+ 6 - 4
arch/frv/mm/fault.c

@@ -34,11 +34,11 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
 	struct vm_area_struct *vma;
 	struct mm_struct *mm;
 	unsigned long _pme, lrai, lrad, fixup;
+	unsigned long flags = 0;
 	siginfo_t info;
 	pgd_t *pge;
 	pud_t *pue;
 	pte_t *pte;
-	int write;
 	int fault;
 
 #if 0
@@ -81,6 +81,9 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
 	if (in_atomic() || !mm)
 		goto no_context;
 
+	if (user_mode(__frame))
+		flags |= FAULT_FLAG_USER;
+
 	down_read(&mm->mmap_sem);
 
 	vma = find_vma(mm, ear0);
@@ -129,7 +132,6 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
  */
  good_area:
 	info.si_code = SEGV_ACCERR;
-	write = 0;
 	switch (esr0 & ESR0_ATXC) {
 	default:
 		/* handle write to write protected page */
@@ -140,7 +142,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
 #endif
 		if (!(vma->vm_flags & VM_WRITE))
 			goto bad_area;
-		write = 1;
+		flags |= FAULT_FLAG_WRITE;
 		break;
 
 		 /* handle read from protected page */
@@ -162,7 +164,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
 	 * make sure we exit gracefully rather than endlessly redo
 	 * the fault.
 	 */
-	fault = handle_mm_fault(mm, vma, ear0, write ? FAULT_FLAG_WRITE : 0);
+	fault = handle_mm_fault(mm, vma, ear0, flags);
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		if (fault & VM_FAULT_OOM)
 			goto out_of_memory;

+ 4 - 2
arch/hexagon/mm/vm_fault.c

@@ -53,8 +53,7 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs)
 	int si_code = SEGV_MAPERR;
 	int fault;
 	const struct exception_table_entry *fixup;
-	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-				 (cause > 0 ? FAULT_FLAG_WRITE : 0);
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 	/*
 	 * If we're in an interrupt or have no user context,
@@ -65,6 +64,8 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs)
 
 	local_irq_enable();
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
 retry:
 	down_read(&mm->mmap_sem);
 	vma = find_vma(mm, address);
@@ -96,6 +97,7 @@ good_area:
 	case FLT_STORE:
 		if (!(vma->vm_flags & VM_WRITE))
 			goto bad_area;
+		flags |= FAULT_FLAG_WRITE;
 		break;
 	}
 

+ 4 - 2
arch/ia64/mm/fault.c

@@ -90,8 +90,6 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
 	mask = ((((isr >> IA64_ISR_X_BIT) & 1UL) << VM_EXEC_BIT)
 		| (((isr >> IA64_ISR_W_BIT) & 1UL) << VM_WRITE_BIT));
 
-	flags |= ((mask & VM_WRITE) ? FAULT_FLAG_WRITE : 0);
-
 	/* mmap_sem is performance critical.... */
 	prefetchw(&mm->mmap_sem);
 
@@ -119,6 +117,10 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
 	if (notify_page_fault(regs, TRAP_BRKPT))
 		return;
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
+	if (mask & VM_WRITE)
+		flags |= FAULT_FLAG_WRITE;
 retry:
 	down_read(&mm->mmap_sem);
 

+ 6 - 4
arch/m32r/mm/fault.c

@@ -78,7 +78,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	struct mm_struct *mm;
 	struct vm_area_struct * vma;
 	unsigned long page, addr;
-	int write;
+	unsigned long flags = 0;
 	int fault;
 	siginfo_t info;
 
@@ -117,6 +117,9 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	if (in_atomic() || !mm)
 		goto bad_area_nosemaphore;
 
+	if (error_code & ACE_USERMODE)
+		flags |= FAULT_FLAG_USER;
+
 	/* When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in the
 	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
@@ -166,14 +169,13 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code,
  */
 good_area:
 	info.si_code = SEGV_ACCERR;
-	write = 0;
 	switch (error_code & (ACE_WRITE|ACE_PROTECTION)) {
 		default:	/* 3: write, present */
 			/* fall through */
 		case ACE_WRITE:	/* write, not present */
 			if (!(vma->vm_flags & VM_WRITE))
 				goto bad_area;
-			write++;
+			flags |= FAULT_FLAG_WRITE;
 			break;
 		case ACE_PROTECTION:	/* read, present */
 		case 0:		/* read, not present */
@@ -194,7 +196,7 @@ good_area:
 	 */
 	addr = (address & PAGE_MASK);
 	set_thread_fault_code(error_code);
-	fault = handle_mm_fault(mm, vma, addr, write ? FAULT_FLAG_WRITE : 0);
+	fault = handle_mm_fault(mm, vma, addr, flags);
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		if (fault & VM_FAULT_OOM)
 			goto out_of_memory;

+ 2 - 0
arch/m68k/mm/fault.c

@@ -88,6 +88,8 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
 	if (in_atomic() || !mm)
 		goto no_context;
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
 retry:
 	down_read(&mm->mmap_sem);
 

+ 4 - 2
arch/metag/mm/fault.c

@@ -53,8 +53,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
 	struct vm_area_struct *vma, *prev_vma;
 	siginfo_t info;
 	int fault;
-	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-				(write_access ? FAULT_FLAG_WRITE : 0);
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 	tsk = current;
 
@@ -109,6 +108,8 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
 	if (in_atomic() || !mm)
 		goto no_context;
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
 retry:
 	down_read(&mm->mmap_sem);
 
@@ -121,6 +122,7 @@ good_area:
 	if (write_access) {
 		if (!(vma->vm_flags & VM_WRITE))
 			goto bad_area;
+		flags |= FAULT_FLAG_WRITE;
 	} else {
 		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
 			goto bad_area;

+ 5 - 2
arch/microblaze/mm/fault.c

@@ -92,8 +92,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address,
 	int code = SEGV_MAPERR;
 	int is_write = error_code & ESR_S;
 	int fault;
-	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-					 (is_write ? FAULT_FLAG_WRITE : 0);
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 	regs->ear = address;
 	regs->esr = error_code;
@@ -121,6 +120,9 @@ void do_page_fault(struct pt_regs *regs, unsigned long address,
 		die("Weird page fault", regs, SIGSEGV);
 	}
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
+
 	/* When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in the
 	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
@@ -199,6 +201,7 @@ good_area:
 	if (unlikely(is_write)) {
 		if (unlikely(!(vma->vm_flags & VM_WRITE)))
 			goto bad_area;
+		flags |= FAULT_FLAG_WRITE;
 	/* a read */
 	} else {
 		/* protection fault */

+ 6 - 2
arch/mips/mm/fault.c

@@ -42,8 +42,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write,
 	const int field = sizeof(unsigned long) * 2;
 	siginfo_t info;
 	int fault;
-	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-						 (write ? FAULT_FLAG_WRITE : 0);
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 #if 0
 	printk("Cpu%d[%s:%d:%0*lx:%ld:%0*lx]\n", raw_smp_processor_id(),
@@ -93,6 +92,8 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write,
 	if (in_atomic() || !mm)
 		goto bad_area_nosemaphore;
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
 retry:
 	down_read(&mm->mmap_sem);
 	vma = find_vma(mm, address);
@@ -114,6 +115,7 @@ good_area:
 	if (write) {
 		if (!(vma->vm_flags & VM_WRITE))
 			goto bad_area;
+		flags |= FAULT_FLAG_WRITE;
 	} else {
 		if (cpu_has_rixi) {
 			if (address == regs->cp0_epc && !(vma->vm_flags & VM_EXEC)) {
@@ -241,6 +243,8 @@ out_of_memory:
 	 * (which will retry the fault, or kill us if we got oom-killed).
 	 */
 	up_read(&mm->mmap_sem);
+	if (!user_mode(regs))
+		goto no_context;
 	pagefault_out_of_memory();
 	return;
 

+ 2 - 0
arch/mn10300/mm/fault.c

@@ -171,6 +171,8 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long fault_code,
 	if (in_atomic() || !mm)
 		goto no_context;
 
+	if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR)
+		flags |= FAULT_FLAG_USER;
 retry:
 	down_read(&mm->mmap_sem);
 

+ 1 - 0
arch/openrisc/mm/fault.c

@@ -86,6 +86,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long address,
 	if (user_mode(regs)) {
 		/* Exception was in userspace: reenable interrupts */
 		local_irq_enable();
+		flags |= FAULT_FLAG_USER;
 	} else {
 		/* If exception was in a syscall, then IRQ's may have
 		 * been enabled or disabled.  If they were enabled,

+ 5 - 2
arch/parisc/mm/fault.c

@@ -180,6 +180,10 @@ void do_page_fault(struct pt_regs *regs, unsigned long code,
 	if (in_atomic() || !mm)
 		goto no_context;
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
+	if (acc_type & VM_WRITE)
+		flags |= FAULT_FLAG_WRITE;
 retry:
 	down_read(&mm->mmap_sem);
 	vma = find_vma_prev(mm, address, &prev_vma);
@@ -203,8 +207,7 @@ good_area:
 	 * fault.
 	 */
 
-	fault = handle_mm_fault(mm, vma, address,
-			flags | ((acc_type & VM_WRITE) ? FAULT_FLAG_WRITE : 0));
+	fault = handle_mm_fault(mm, vma, address, flags);
 
 	if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
 		return;

+ 4 - 3
arch/powerpc/mm/fault.c

@@ -223,9 +223,6 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 	is_write = error_code & ESR_DST;
 #endif /* CONFIG_4xx || CONFIG_BOOKE */
 
-	if (is_write)
-		flags |= FAULT_FLAG_WRITE;
-
 #ifdef CONFIG_PPC_ICSWX
 	/*
 	 * we need to do this early because this "data storage
@@ -288,6 +285,9 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 	if (user_mode(regs))
 		store_update_sp = store_updates_sp(regs);
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
+
 	/* When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in the
 	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
@@ -415,6 +415,7 @@ good_area:
 	} else if (is_write) {
 		if (!(vma->vm_flags & VM_WRITE))
 			goto bad_area;
+		flags |= FAULT_FLAG_WRITE;
 	/* a read */
 	} else {
 		/* protection fault */

+ 2 - 0
arch/s390/mm/fault.c

@@ -302,6 +302,8 @@ static inline int do_exception(struct pt_regs *regs, int access)
 	address = trans_exc_code & __FAIL_ADDR_MASK;
 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 	flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
 	if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400)
 		flags |= FAULT_FLAG_WRITE;
 	down_read(&mm->mmap_sem);

+ 6 - 7
arch/score/mm/fault.c

@@ -47,6 +47,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write,
 	struct task_struct *tsk = current;
 	struct mm_struct *mm = tsk->mm;
 	const int field = sizeof(unsigned long) * 2;
+	unsigned long flags = 0;
 	siginfo_t info;
 	int fault;
 
@@ -75,6 +76,9 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write,
 	if (in_atomic() || !mm)
 		goto bad_area_nosemaphore;
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
+
 	down_read(&mm->mmap_sem);
 	vma = find_vma(mm, address);
 	if (!vma)
@@ -95,18 +99,18 @@ good_area:
 	if (write) {
 		if (!(vma->vm_flags & VM_WRITE))
 			goto bad_area;
+		flags |= FAULT_FLAG_WRITE;
 	} else {
 		if (!(vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)))
 			goto bad_area;
 	}
 
-survive:
 	/*
 	* If for any reason at all we couldn't handle the fault,
 	* make sure we exit gracefully rather than endlessly redo
 	* the fault.
 	*/
-	fault = handle_mm_fault(mm, vma, address, write);
+	fault = handle_mm_fault(mm, vma, address, flags);
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		if (fault & VM_FAULT_OOM)
 			goto out_of_memory;
@@ -167,11 +171,6 @@ no_context:
 	*/
 out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (is_global_init(tsk)) {
-		yield();
-		down_read(&mm->mmap_sem);
-		goto survive;
-	}
 	if (!user_mode(regs))
 		goto no_context;
 	pagefault_out_of_memory();

+ 6 - 3
arch/sh/mm/fault.c

@@ -400,9 +400,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 	struct mm_struct *mm;
 	struct vm_area_struct * vma;
 	int fault;
-	int write = error_code & FAULT_CODE_WRITE;
-	unsigned int flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-			      (write ? FAULT_FLAG_WRITE : 0));
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 	tsk = current;
 	mm = tsk->mm;
@@ -476,6 +474,11 @@ good_area:
 
 	set_thread_fault_code(error_code);
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
+	if (error_code & FAULT_CODE_WRITE)
+		flags |= FAULT_FLAG_WRITE;
+
 	/*
 	 * If for any reason at all we couldn't handle the fault,
 	 * make sure we exit gracefully rather than endlessly redo

+ 9 - 3
arch/sparc/mm/fault_32.c

@@ -177,8 +177,7 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write,
 	unsigned long g2;
 	int from_user = !(regs->psr & PSR_PS);
 	int fault, code;
-	unsigned int flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-			      (write ? FAULT_FLAG_WRITE : 0));
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 	if (text_fault)
 		address = regs->pc;
@@ -235,6 +234,11 @@ good_area:
 			goto bad_area;
 	}
 
+	if (from_user)
+		flags |= FAULT_FLAG_USER;
+	if (write)
+		flags |= FAULT_FLAG_WRITE;
+
 	/*
 	 * If for any reason at all we couldn't handle the fault,
 	 * make sure we exit gracefully rather than endlessly redo
@@ -383,6 +387,7 @@ static void force_user_fault(unsigned long address, int write)
 	struct vm_area_struct *vma;
 	struct task_struct *tsk = current;
 	struct mm_struct *mm = tsk->mm;
+	unsigned int flags = FAULT_FLAG_USER;
 	int code;
 
 	code = SEGV_MAPERR;
@@ -402,11 +407,12 @@ good_area:
 	if (write) {
 		if (!(vma->vm_flags & VM_WRITE))
 			goto bad_area;
+		flags |= FAULT_FLAG_WRITE;
 	} else {
 		if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
 			goto bad_area;
 	}
-	switch (handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0)) {
+	switch (handle_mm_fault(mm, vma, address, flags)) {
 	case VM_FAULT_SIGBUS:
 	case VM_FAULT_OOM:
 		goto do_sigbus;

+ 4 - 2
arch/sparc/mm/fault_64.c

@@ -315,7 +315,8 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
 			bad_kernel_pc(regs, address);
 			return;
 		}
-	}
+	} else
+		flags |= FAULT_FLAG_USER;
 
 	/*
 	 * If we're in an interrupt or have no user
@@ -418,13 +419,14 @@ good_area:
 		    vma->vm_file != NULL)
 			set_thread_fault_code(fault_code |
 					      FAULT_CODE_BLKCOMMIT);
+
+		flags |= FAULT_FLAG_WRITE;
 	} else {
 		/* Allow reads even for write-only mappings */
 		if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
 			goto bad_area;
 	}
 
-	flags |= ((fault_code & FAULT_CODE_WRITE) ? FAULT_FLAG_WRITE : 0);
 	fault = handle_mm_fault(mm, vma, address, flags);
 
 	if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))

+ 5 - 8
arch/tile/mm/fault.c

@@ -280,8 +280,7 @@ static int handle_page_fault(struct pt_regs *regs,
 	if (!is_page_fault)
 		write = 1;
 
-	flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-		 (write ? FAULT_FLAG_WRITE : 0));
+	flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 	is_kernel_mode = !user_mode(regs);
 
@@ -365,6 +364,9 @@ static int handle_page_fault(struct pt_regs *regs,
 		goto bad_area_nosemaphore;
 	}
 
+	if (!is_kernel_mode)
+		flags |= FAULT_FLAG_USER;
+
 	/*
 	 * When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in the
@@ -425,12 +427,12 @@ good_area:
 #endif
 		if (!(vma->vm_flags & VM_WRITE))
 			goto bad_area;
+		flags |= FAULT_FLAG_WRITE;
 	} else {
 		if (!is_page_fault || !(vma->vm_flags & VM_READ))
 			goto bad_area;
 	}
 
- survive:
 	/*
 	 * If for any reason at all we couldn't handle the fault,
 	 * make sure we exit gracefully rather than endlessly redo
@@ -555,11 +557,6 @@ no_context:
  */
 out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (is_global_init(tsk)) {
-		yield();
-		down_read(&mm->mmap_sem);
-		goto survive;
-	}
 	if (is_kernel_mode)
 		goto no_context;
 	pagefault_out_of_memory();

+ 14 - 8
arch/um/kernel/trap.c

@@ -30,8 +30,7 @@ int handle_page_fault(unsigned long address, unsigned long ip,
 	pmd_t *pmd;
 	pte_t *pte;
 	int err = -EFAULT;
-	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-				 (is_write ? FAULT_FLAG_WRITE : 0);
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 	*code_out = SEGV_MAPERR;
 
@@ -42,6 +41,8 @@ int handle_page_fault(unsigned long address, unsigned long ip,
 	if (in_atomic())
 		goto out_nosemaphore;
 
+	if (is_user)
+		flags |= FAULT_FLAG_USER;
 retry:
 	down_read(&mm->mmap_sem);
 	vma = find_vma(mm, address);
@@ -58,12 +59,15 @@ retry:
 
 good_area:
 	*code_out = SEGV_ACCERR;
-	if (is_write && !(vma->vm_flags & VM_WRITE))
-		goto out;
-
-	/* Don't require VM_READ|VM_EXEC for write faults! */
-	if (!is_write && !(vma->vm_flags & (VM_READ | VM_EXEC)))
-		goto out;
+	if (is_write) {
+		if (!(vma->vm_flags & VM_WRITE))
+			goto out;
+		flags |= FAULT_FLAG_WRITE;
+	} else {
+		/* Don't require VM_READ|VM_EXEC for write faults! */
+		if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
+			goto out;
+	}
 
 	do {
 		int fault;
@@ -124,6 +128,8 @@ out_of_memory:
 	 * (which will retry the fault, or kill us if we got oom-killed).
 	 */
 	up_read(&mm->mmap_sem);
+	if (!is_user)
+		goto out_nosemaphore;
 	pagefault_out_of_memory();
 	return 0;
 }

+ 13 - 9
arch/unicore32/mm/fault.c

@@ -209,8 +209,7 @@ static int do_pf(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 	struct task_struct *tsk;
 	struct mm_struct *mm;
 	int fault, sig, code;
-	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-				 ((!(fsr ^ 0x12)) ? FAULT_FLAG_WRITE : 0);
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 	tsk = current;
 	mm = tsk->mm;
@@ -222,6 +221,11 @@ static int do_pf(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 	if (in_atomic() || !mm)
 		goto no_context;
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
+	if (!(fsr ^ 0x12))
+		flags |= FAULT_FLAG_WRITE;
+
 	/*
 	 * As per x86, we may deadlock here.  However, since the kernel only
 	 * validly references user space from well defined areas of the code,
@@ -278,6 +282,13 @@ retry:
 	       (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS))))
 		return 0;
 
+	/*
+	 * If we are in kernel mode at this point, we
+	 * have no context to handle this fault with.
+	 */
+	if (!user_mode(regs))
+		goto no_context;
+
 	if (fault & VM_FAULT_OOM) {
 		/*
 		 * We ran out of memory, call the OOM killer, and return to
@@ -288,13 +299,6 @@ retry:
 		return 0;
 	}
 
-	/*
-	 * If we are in kernel mode at this point, we
-	 * have no context to handle this fault with.
-	 */
-	if (!user_mode(regs))
-		goto no_context;
-
 	if (fault & VM_FAULT_SIGBUS) {
 		/*
 		 * We had some memory, but were unable to

+ 22 - 21
arch/x86/mm/fault.c

@@ -842,23 +842,15 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 	force_sig_info_fault(SIGBUS, code, address, tsk, fault);
 }
 
-static noinline int
+static noinline void
 mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 	       unsigned long address, unsigned int fault)
 {
-	/*
-	 * Pagefault was interrupted by SIGKILL. We have no reason to
-	 * continue pagefault.
-	 */
-	if (fatal_signal_pending(current)) {
-		if (!(fault & VM_FAULT_RETRY))
-			up_read(&current->mm->mmap_sem);
-		if (!(error_code & PF_USER))
-			no_context(regs, error_code, address, 0, 0);
-		return 1;
+	if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
+		up_read(&current->mm->mmap_sem);
+		no_context(regs, error_code, address, 0, 0);
+		return;
 	}
-	if (!(fault & VM_FAULT_ERROR))
-		return 0;
 
 	if (fault & VM_FAULT_OOM) {
 		/* Kernel mode? Handle exceptions or die: */
@@ -866,7 +858,7 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 			up_read(&current->mm->mmap_sem);
 			no_context(regs, error_code, address,
 				   SIGSEGV, SEGV_MAPERR);
-			return 1;
+			return;
 		}
 
 		up_read(&current->mm->mmap_sem);
@@ -884,7 +876,6 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 		else
 			BUG();
 	}
-	return 1;
 }
 
 static int spurious_fault_check(unsigned long error_code, pte_t *pte)
@@ -1011,9 +1002,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	unsigned long address;
 	struct mm_struct *mm;
 	int fault;
-	int write = error_code & PF_WRITE;
-	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-					(write ? FAULT_FLAG_WRITE : 0);
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 	tsk = current;
 	mm = tsk->mm;
@@ -1083,6 +1072,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	if (user_mode_vm(regs)) {
 		local_irq_enable();
 		error_code |= PF_USER;
+		flags |= FAULT_FLAG_USER;
 	} else {
 		if (regs->flags & X86_EFLAGS_IF)
 			local_irq_enable();
@@ -1109,6 +1099,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
 		return;
 	}
 
+	if (error_code & PF_WRITE)
+		flags |= FAULT_FLAG_WRITE;
+
 	/*
 	 * When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in
@@ -1187,9 +1180,17 @@ good_area:
 	 */
 	fault = handle_mm_fault(mm, vma, address, flags);
 
-	if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
-		if (mm_fault_error(regs, error_code, address, fault))
-			return;
+	/*
+	 * If we need to retry but a fatal signal is pending, handle the
+	 * signal first. We do not need to release the mmap_sem because it
+	 * would already be released in __lock_page_or_retry in mm/filemap.c.
+	 */
+	if (unlikely((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)))
+		return;
+
+	if (unlikely(fault & VM_FAULT_ERROR)) {
+		mm_fault_error(regs, error_code, address, fault);
+		return;
 	}
 
 	/*

+ 2 - 0
arch/xtensa/mm/fault.c

@@ -72,6 +72,8 @@ void do_page_fault(struct pt_regs *regs)
 	       address, exccause, regs->pc, is_write? "w":"", is_exec? "x":"");
 #endif
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
 retry:
 	down_read(&mm->mmap_sem);
 	vma = find_vma(mm, address);

+ 0 - 6
drivers/base/node.c

@@ -125,13 +125,7 @@ static ssize_t node_read_meminfo(struct device *dev,
 		       nid, K(node_page_state(nid, NR_WRITEBACK)),
 		       nid, K(node_page_state(nid, NR_FILE_PAGES)),
 		       nid, K(node_page_state(nid, NR_FILE_MAPPED)),
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-		       nid, K(node_page_state(nid, NR_ANON_PAGES)
-			+ node_page_state(nid, NR_ANON_TRANSPARENT_HUGEPAGES) *
-			HPAGE_PMD_NR),
-#else
 		       nid, K(node_page_state(nid, NR_ANON_PAGES)),
-#endif
 		       nid, K(node_page_state(nid, NR_SHMEM)),
 		       nid, node_page_state(nid, NR_KERNEL_STACK) *
 				THREAD_SIZE / 1024,

+ 1 - 1
fs/adfs/inode.c

@@ -50,7 +50,7 @@ static void adfs_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size)
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 }
 
 static int adfs_write_begin(struct file *file, struct address_space *mapping,

+ 1 - 1
fs/affs/file.c

@@ -406,7 +406,7 @@ static void affs_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		affs_truncate(inode);
 	}
 }

+ 1 - 1
fs/bfs/file.c

@@ -166,7 +166,7 @@ static void bfs_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size)
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 }
 
 static int bfs_write_begin(struct file *file, struct address_space *mapping,

+ 1 - 3
fs/btrfs/free-space-cache.c

@@ -221,12 +221,10 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 				    struct btrfs_path *path,
 				    struct inode *inode)
 {
-	loff_t oldsize;
 	int ret = 0;
 
-	oldsize = i_size_read(inode);
 	btrfs_i_size_write(inode, 0);
-	truncate_pagecache(inode, oldsize, 0);
+	truncate_pagecache(inode, 0);
 
 	/*
 	 * We don't need an orphan item because truncating the free space cache

+ 1 - 1
fs/btrfs/inode.c

@@ -4349,7 +4349,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 		inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
 
 	if (newsize > oldsize) {
-		truncate_pagecache(inode, oldsize, newsize);
+		truncate_pagecache(inode, newsize);
 		ret = btrfs_cont_expand(inode, oldsize, newsize);
 		if (ret)
 			return ret;

+ 1 - 4
fs/cifs/inode.c

@@ -1856,14 +1856,11 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from)
 
 static void cifs_setsize(struct inode *inode, loff_t offset)
 {
-	loff_t oldsize;
-
 	spin_lock(&inode->i_lock);
-	oldsize = inode->i_size;
 	i_size_write(inode, offset);
 	spin_unlock(&inode->i_lock);
 
-	truncate_pagecache(inode, oldsize, offset);
+	truncate_pagecache(inode, offset);
 }
 
 static int

+ 1 - 1
fs/exofs/inode.c

@@ -861,7 +861,7 @@ static int exofs_writepage(struct page *page, struct writeback_control *wbc)
 static void _write_failed(struct inode *inode, loff_t to)
 {
 	if (to > inode->i_size)
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 }
 
 int exofs_write_begin(struct file *file, struct address_space *mapping,

+ 1 - 1
fs/ext2/inode.c

@@ -58,7 +58,7 @@ static void ext2_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		ext2_truncate_blocks(inode, inode->i_size);
 	}
 }

+ 1 - 2
fs/ext4/inode.c

@@ -4587,7 +4587,6 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 
 	if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
 		handle_t *handle;
-		loff_t oldsize = inode->i_size;
 
 		if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
 			struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -4650,7 +4649,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 		 * Truncate pagecache after we've waited for commit
 		 * in data=journal mode to make pages freeable.
 		 */
-		truncate_pagecache(inode, oldsize, inode->i_size);
+			truncate_pagecache(inode, inode->i_size);
 	}
 	/*
 	 * We want to call ext4_truncate() even if attr->ia_size ==

+ 1 - 1
fs/fat/inode.c

@@ -147,7 +147,7 @@ static void fat_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		fat_truncate_blocks(inode, inode->i_size);
 	}
 }

+ 1 - 1
fs/fuse/dir.c

@@ -1678,7 +1678,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
 	 * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock.
 	 */
 	if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
-		truncate_pagecache(inode, oldsize, outarg.attr.size);
+		truncate_pagecache(inode, outarg.attr.size);
 		invalidate_inode_pages2(inode->i_mapping);
 	}
 

+ 1 - 1
fs/fuse/inode.c

@@ -218,7 +218,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
 		bool inval = false;
 
 		if (oldsize != attr->size) {
-			truncate_pagecache(inode, oldsize, attr->size);
+			truncate_pagecache(inode, attr->size);
 			inval = true;
 		} else if (fc->auto_inval_data) {
 			struct timespec new_mtime = {

+ 2 - 2
fs/gfs2/bmap.c

@@ -1016,7 +1016,7 @@ static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize
 		chunk = oldsize - newsize;
 		if (chunk > max_chunk)
 			chunk = max_chunk;
-		truncate_pagecache(inode, oldsize, oldsize - chunk);
+		truncate_pagecache(inode, oldsize - chunk);
 		oldsize -= chunk;
 		gfs2_trans_end(sdp);
 		error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
@@ -1067,7 +1067,7 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
 	if (journaled)
 		error = gfs2_journaled_truncate(inode, oldsize, newsize);
 	else
-		truncate_pagecache(inode, oldsize, newsize);
+		truncate_pagecache(inode, newsize);
 
 	if (error) {
 		brelse(dibh);

+ 1 - 1
fs/hfs/inode.c

@@ -41,7 +41,7 @@ static void hfs_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		hfs_file_truncate(inode);
 	}
 }

+ 1 - 1
fs/hfsplus/inode.c

@@ -36,7 +36,7 @@ static void hfsplus_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		hfsplus_file_truncate(inode);
 	}
 }

+ 1 - 1
fs/hpfs/file.c

@@ -138,7 +138,7 @@ static void hpfs_write_failed(struct address_space *mapping, loff_t to)
 	hpfs_lock(inode->i_sb);

 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		hpfs_truncate(inode);
 	}


+ 1 - 1
fs/jfs/inode.c

@@ -306,7 +306,7 @@ static void jfs_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;

 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		jfs_truncate(inode);
 	}
 }

+ 1 - 1
fs/minix/inode.c

@@ -400,7 +400,7 @@ static void minix_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;

 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		minix_truncate(inode);
 	}
 }

+ 1 - 3
fs/nfs/inode.c

@@ -541,7 +541,6 @@ EXPORT_SYMBOL_GPL(nfs_setattr);
  */
 static int nfs_vmtruncate(struct inode * inode, loff_t offset)
 {
-	loff_t oldsize;
 	int err;

 	err = inode_newsize_ok(inode, offset);
@@ -549,11 +548,10 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset)
 		goto out;

 	spin_lock(&inode->i_lock);
-	oldsize = inode->i_size;
 	i_size_write(inode, offset);
 	spin_unlock(&inode->i_lock);

-	truncate_pagecache(inode, oldsize, offset);
+	truncate_pagecache(inode, offset);
 out:
 	return err;
 }

+ 1 - 1
fs/nilfs2/inode.c

@@ -254,7 +254,7 @@ void nilfs_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;

 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		nilfs_truncate(inode);
 	}
 }

+ 1 - 1
fs/ntfs/file.c

@@ -1768,7 +1768,7 @@ static void ntfs_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;

 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		ntfs_truncate_vfs(inode);
 	}
 }

+ 1 - 1
fs/omfs/file.c

@@ -311,7 +311,7 @@ static void omfs_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;

 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		omfs_truncate(inode);
 	}
 }

+ 0 - 6
fs/proc/meminfo.c

@@ -132,13 +132,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		K(i.freeswap),
 		K(global_page_state(NR_FILE_DIRTY)),
 		K(global_page_state(NR_WRITEBACK)),
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-		K(global_page_state(NR_ANON_PAGES)
-		  + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
-		  HPAGE_PMD_NR),
-#else
 		K(global_page_state(NR_ANON_PAGES)),
-#endif
 		K(global_page_state(NR_FILE_MAPPED)),
 		K(global_page_state(NR_SHMEM)),
 		K(global_page_state(NR_SLAB_RECLAIMABLE) +

+ 1 - 1
fs/sysv/itree.c

@@ -469,7 +469,7 @@ static void sysv_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;

 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		sysv_truncate(inode);
 	}
 }

+ 1 - 1
fs/udf/inode.c

@@ -172,7 +172,7 @@ static void udf_write_failed(struct address_space *mapping, loff_t to)
 	loff_t isize = inode->i_size;

 	if (to > isize) {
-		truncate_pagecache(inode, to, isize);
+		truncate_pagecache(inode, isize);
 		if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
 			down_write(&iinfo->i_data_sem);
 			udf_clear_extent_cache(inode);

+ 1 - 1
fs/ufs/inode.c

@@ -531,7 +531,7 @@ static void ufs_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;

 	if (to > inode->i_size)
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 }

 static int ufs_write_begin(struct file *file, struct address_space *mapping,

+ 2 - 2
fs/xfs/xfs_aops.c

@@ -1582,7 +1582,7 @@ xfs_vm_write_begin(
 		unlock_page(page);

 		if (pos + len > i_size_read(inode))
-			truncate_pagecache(inode, pos + len, i_size_read(inode));
+			truncate_pagecache(inode, i_size_read(inode));

 		page_cache_release(page);
 		page = NULL;
@@ -1618,7 +1618,7 @@ xfs_vm_write_end(
 		loff_t		to = pos + len;

 		if (to > isize) {
-			truncate_pagecache(inode, to, isize);
+			truncate_pagecache(inode, isize);
 			xfs_vm_kill_delalloc_range(inode, isize, to);
 		}
 	}

+ 0 - 3
include/linux/huge_mm.h

@@ -96,9 +96,6 @@ extern int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 			  pmd_t *dst_pmd, pmd_t *src_pmd,
 			  struct vm_area_struct *vma,
 			  unsigned long addr, unsigned long end);
-extern int handle_pte_fault(struct mm_struct *mm,
-			    struct vm_area_struct *vma, unsigned long address,
-			    pte_t *pte, pmd_t *pmd, unsigned int flags);
 extern int split_huge_page_to_list(struct page *page, struct list_head *list);
 static inline int split_huge_page(struct page *page)
 {

+ 130 - 18
include/linux/memcontrol.h

@@ -30,9 +30,21 @@ struct page;
 struct mm_struct;
 struct kmem_cache;

-/* Stats that can be updated by kernel. */
-enum mem_cgroup_page_stat_item {
-	MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */
+/*
+ * The corresponding mem_cgroup_stat_names is defined in mm/memcontrol.c,
+ * These two lists should keep in accord with each other.
+ */
+enum mem_cgroup_stat_index {
+	/*
+	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
+	 */
+	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
+	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
+	MEM_CGROUP_STAT_RSS_HUGE,	/* # of pages charged as anon huge */
+	MEM_CGROUP_STAT_FILE_MAPPED,	/* # of pages charged as file rss */
+	MEM_CGROUP_STAT_WRITEBACK,	/* # of pages under writeback */
+	MEM_CGROUP_STAT_SWAP,		/* # of pages, swapped out */
+	MEM_CGROUP_STAT_NSTATS,
 };

 struct mem_cgroup_reclaim_cookie {
@@ -41,6 +53,23 @@ struct mem_cgroup_reclaim_cookie {
 	unsigned int generation;
 };

+enum mem_cgroup_filter_t {
+	VISIT,		/* visit current node */
+	SKIP,		/* skip the current node and continue traversal */
+	SKIP_TREE,	/* skip the whole subtree and continue traversal */
+};
+
+/*
+ * mem_cgroup_filter_t predicate might instruct mem_cgroup_iter_cond how to
+ * iterate through the hierarchy tree. Each tree element is checked by the
+ * predicate before it is returned by the iterator. If a filter returns
+ * SKIP or SKIP_TREE then the iterator code continues traversal (with the
+ * next node down the hierarchy or the next node that doesn't belong under the
+ * memcg's subtree).
+ */
+typedef enum mem_cgroup_filter_t
+(*mem_cgroup_iter_filter)(struct mem_cgroup *memcg, struct mem_cgroup *root);
+
 #ifdef CONFIG_MEMCG
 /*
  * All "charge" functions with gfp_mask should use GFP_KERNEL or
@@ -108,9 +137,18 @@ mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
 extern void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 	struct page *oldpage, struct page *newpage, bool migration_ok);

-struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
-				   struct mem_cgroup *,
-				   struct mem_cgroup_reclaim_cookie *);
+struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
+				   struct mem_cgroup *prev,
+				   struct mem_cgroup_reclaim_cookie *reclaim,
+				   mem_cgroup_iter_filter cond);
+
+static inline struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
+				   struct mem_cgroup *prev,
+				   struct mem_cgroup_reclaim_cookie *reclaim)
+{
+	return mem_cgroup_iter_cond(root, prev, reclaim, NULL);
+}
+
 void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);

 /*
@@ -125,6 +163,48 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
 extern void mem_cgroup_replace_page_cache(struct page *oldpage,
 					struct page *newpage);

+/**
+ * mem_cgroup_toggle_oom - toggle the memcg OOM killer for the current task
+ * @new: true to enable, false to disable
+ *
+ * Toggle whether a failed memcg charge should invoke the OOM killer
+ * or just return -ENOMEM.  Returns the previous toggle state.
+ *
+ * NOTE: Any path that enables the OOM killer before charging must
+ *       call mem_cgroup_oom_synchronize() afterward to finalize the
+ *       OOM handling and clean up.
+ */
+static inline bool mem_cgroup_toggle_oom(bool new)
+{
+	bool old;
+
+	old = current->memcg_oom.may_oom;
+	current->memcg_oom.may_oom = new;
+
+	return old;
+}
+
+static inline void mem_cgroup_enable_oom(void)
+{
+	bool old = mem_cgroup_toggle_oom(true);
+
+	WARN_ON(old == true);
+}
+
+static inline void mem_cgroup_disable_oom(void)
+{
+	bool old = mem_cgroup_toggle_oom(false);
+
+	WARN_ON(old == false);
+}
+
+static inline bool task_in_memcg_oom(struct task_struct *p)
+{
+	return p->memcg_oom.in_memcg_oom;
+}
+
+bool mem_cgroup_oom_synchronize(void);
+
 #ifdef CONFIG_MEMCG_SWAP
 extern int do_swap_account;
 #endif
@@ -165,24 +245,24 @@ static inline void mem_cgroup_end_update_page_stat(struct page *page,
 }

 void mem_cgroup_update_page_stat(struct page *page,
-				 enum mem_cgroup_page_stat_item idx,
+				 enum mem_cgroup_stat_index idx,
 				 int val);

 static inline void mem_cgroup_inc_page_stat(struct page *page,
-					    enum mem_cgroup_page_stat_item idx)
+					    enum mem_cgroup_stat_index idx)
 {
 	mem_cgroup_update_page_stat(page, idx, 1);
 }

 static inline void mem_cgroup_dec_page_stat(struct page *page,
-					    enum mem_cgroup_page_stat_item idx)
+					    enum mem_cgroup_stat_index idx)
 {
 	mem_cgroup_update_page_stat(page, idx, -1);
 }

-unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
-						gfp_t gfp_mask,
-						unsigned long *total_scanned);
+enum mem_cgroup_filter_t
+mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
+		struct mem_cgroup *root);

 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
 static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
@@ -296,6 +376,15 @@ static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 		struct page *oldpage, struct page *newpage, bool migration_ok)
 {
 }
+static inline struct mem_cgroup *
+mem_cgroup_iter_cond(struct mem_cgroup *root,
+		struct mem_cgroup *prev,
+		struct mem_cgroup_reclaim_cookie *reclaim,
+		mem_cgroup_iter_filter cond)
+{
+	/* first call must return non-NULL, second return NULL */
+	return (struct mem_cgroup *)(unsigned long)!prev;
+}

 static inline struct mem_cgroup *
 mem_cgroup_iter(struct mem_cgroup *root,
@@ -348,22 +437,45 @@ static inline void mem_cgroup_end_update_page_stat(struct page *page,
 {
 }

+static inline bool mem_cgroup_toggle_oom(bool new)
+{
+	return false;
+}
+
+static inline void mem_cgroup_enable_oom(void)
+{
+}
+
+static inline void mem_cgroup_disable_oom(void)
+{
+}
+
+static inline bool task_in_memcg_oom(struct task_struct *p)
+{
+	return false;
+}
+
+static inline bool mem_cgroup_oom_synchronize(void)
+{
+	return false;
+}
+
 static inline void mem_cgroup_inc_page_stat(struct page *page,
-					    enum mem_cgroup_page_stat_item idx)
+					    enum mem_cgroup_stat_index idx)
 {
 }

 static inline void mem_cgroup_dec_page_stat(struct page *page,
-					    enum mem_cgroup_page_stat_item idx)
+					    enum mem_cgroup_stat_index idx)
 {
 }

 static inline
-unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
-					    gfp_t gfp_mask,
-					    unsigned long *total_scanned)
+enum mem_cgroup_filter_t
+mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
+		struct mem_cgroup *root)
 {
-	return 0;
+	return VISIT;
 }

 static inline void mem_cgroup_split_huge_fixup(struct page *head)
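The conditional iterator declared above is meant to be driven exactly like mem_cgroup_iter(), with the filter pruning groups or whole subtrees. A minimal sketch of a caller, assuming the soft-limit predicate from this header (the real consumer in mm/vmscan.c is not part of this excerpt):

/* Illustrative walk: the predicate decides whether to visit, skip, or prune a subtree. */
static void example_soft_limit_walk(struct mem_cgroup *root)
{
	struct mem_cgroup *memcg = NULL;

	for (;;) {
		memcg = mem_cgroup_iter_cond(root, memcg, NULL,
					     mem_cgroup_soft_reclaim_eligible);
		if (!memcg)
			break;		/* full round trip finished */
		/* ... shrink or inspect this group ... */
	}
}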

+ 4 - 2
include/linux/mm.h

@@ -176,6 +176,7 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_RETRY_NOWAIT	0x10	/* Don't drop mmap_sem and wait when retrying */
 #define FAULT_FLAG_KILLABLE	0x20	/* The fault task is in SIGKILL killable region */
 #define FAULT_FLAG_TRIED	0x40	/* second try */
+#define FAULT_FLAG_USER		0x80	/* The fault originated in userspace */

 /*
  * vm_fault is filled by the the pagefault handler and passed to the vma's
@@ -876,11 +877,12 @@ static inline int page_mapped(struct page *page)
 #define VM_FAULT_NOPAGE	0x0100	/* ->fault installed the pte, not return page */
 #define VM_FAULT_LOCKED	0x0200	/* ->fault locked the returned page */
 #define VM_FAULT_RETRY	0x0400	/* ->fault blocked, must retry */
+#define VM_FAULT_FALLBACK 0x0800	/* huge page fault failed, fall back to small */

 #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */

 #define VM_FAULT_ERROR	(VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \
-			 VM_FAULT_HWPOISON_LARGE)
+			 VM_FAULT_FALLBACK | VM_FAULT_HWPOISON_LARGE)

 /* Encode hstate index for a hwpoisoned large page */
 #define VM_FAULT_SET_HINDEX(x) ((x) << 12)
@@ -984,7 +986,7 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping,
 	unmap_mapping_range(mapping, holebegin, holelen, 0);
 }

-extern void truncate_pagecache(struct inode *inode, loff_t old, loff_t new);
+extern void truncate_pagecache(struct inode *inode, loff_t new);
 extern void truncate_setsize(struct inode *inode, loff_t newsize);
 void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end);
 int truncate_inode_page(struct address_space *mapping, struct page *page);
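With the prototype change, callers pass only the new size; truncate_pagecache() derives the old size from the mapping itself, which is what the filesystem conversions above rely on. A minimal sketch of the updated calling convention (hypothetical helper, not from this commit):

static void example_write_failed(struct address_space *mapping, loff_t to)
{
	struct inode *inode = mapping->host;

	if (to > inode->i_size)
		/* before this series: truncate_pagecache(inode, to, inode->i_size); */
		truncate_pagecache(inode, inode->i_size);
}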

+ 1 - 1
include/linux/res_counter.h

@@ -54,7 +54,7 @@ struct res_counter {
 	struct res_counter *parent;
 };

-#define RESOURCE_MAX (unsigned long long)LLONG_MAX
+#define RES_COUNTER_MAX ULLONG_MAX

 /**
  * Helpers to interact with userspace

+ 7 - 0
include/linux/sched.h

@@ -1393,6 +1393,13 @@ struct task_struct {
 		unsigned long memsw_nr_pages; /* uncharged mem+swap usage */
 	} memcg_batch;
 	unsigned int memcg_kmem_skip_account;
+	struct memcg_oom_info {
+		unsigned int may_oom:1;
+		unsigned int in_memcg_oom:1;
+		unsigned int oom_locked:1;
+		int wakeups;
+		struct mem_cgroup *wait_on_memcg;
+	} memcg_oom;
 #endif
 #ifdef CONFIG_UPROBES
 	struct uprobe_task *utask;
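The new task_struct::memcg_oom fields back the toggle helpers declared in memcontrol.h above. A hedged sketch of the calling pattern the NOTE there describes, with the charging operation assumed purely for illustration:

/* Sketch only: a path that enables the memcg OOM killer before charging
 * must synchronize afterwards, per the NOTE on mem_cgroup_toggle_oom(). */
bool old = mem_cgroup_toggle_oom(true);		/* allow the charge to OOM-kill */
ret = example_charge_operation();		/* hypothetical charge that may fail */
mem_cgroup_toggle_oom(old);			/* restore the previous setting */
if (task_in_memcg_oom(current))
	mem_cgroup_oom_synchronize();		/* finalize or clean up memcg OOM state */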

+ 1 - 1
include/linux/swap.h

@@ -280,7 +280,7 @@ extern void activate_page(struct page *);
 extern void mark_page_accessed(struct page *);
 extern void lru_add_drain(void);
 extern void lru_add_drain_cpu(int cpu);
-extern int lru_add_drain_all(void);
+extern void lru_add_drain_all(void);
 extern void rotate_reclaimable_page(struct page *page);
 extern void deactivate_page(struct page *page);
 extern void swap_setup(void);

+ 1 - 1
kernel/gcov/fs.c

@@ -74,7 +74,7 @@ static int __init gcov_persist_setup(char *str)
 {
 	unsigned long val;

-	if (strict_strtoul(str, 0, &val)) {
+	if (kstrtoul(str, 0, &val)) {
 		pr_warning("invalid gcov_persist parameter '%s'\n", str);
 		return 0;
 	}

+ 1 - 1
kernel/ksysfs.c

@@ -113,7 +113,7 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj,
 	unsigned long cnt;
 	int ret;

-	if (strict_strtoul(buf, 0, &cnt))
+	if (kstrtoul(buf, 0, &cnt))
 		return -EINVAL;

 	ret = crash_shrink_memory(cnt);

+ 7 - 7
kernel/params.c

@@ -253,13 +253,13 @@ int parse_args(const char *doing,
 	EXPORT_SYMBOL(param_ops_##name)


-STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, strict_strtoul);
-STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol);
-STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, strict_strtoul);
-STANDARD_PARAM_DEF(int, int, "%i", long, strict_strtol);
-STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul);
-STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol);
-STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul);
+STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, kstrtoul);
+STANDARD_PARAM_DEF(short, short, "%hi", long, kstrtol);
+STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, kstrtoul);
+STANDARD_PARAM_DEF(int, int, "%i", long, kstrtol);
+STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, kstrtoul);
+STANDARD_PARAM_DEF(long, long, "%li", long, kstrtol);
+STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, kstrtoul);

 int param_set_charp(const char *val, const struct kernel_param *kp)
 {
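The kstrto*() family keeps the strict_strto*() error convention (0 on success, negative errno on failure) but distinguishes signed from unsigned parsing, hence kstrtol for the signed %hi/%i/%li definitions above. A small usage sketch (buffer contents and function name assumed):

static int example_parse(const char *buf)
{
	long sval;
	unsigned long uval;

	if (kstrtol(buf, 0, &sval))	/* signed: accepts a leading '-' */
		return -EINVAL;
	if (kstrtoul(buf, 0, &uval))	/* unsigned; base 0 auto-detects 0x/0 prefixes */
		return -EINVAL;
	return 0;
}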

+ 16 - 9
kernel/res_counter.c

@@ -17,8 +17,8 @@
 void res_counter_init(struct res_counter *counter, struct res_counter *parent)
 {
 	spin_lock_init(&counter->lock);
-	counter->limit = RESOURCE_MAX;
-	counter->soft_limit = RESOURCE_MAX;
+	counter->limit = RES_COUNTER_MAX;
+	counter->soft_limit = RES_COUNTER_MAX;
 	counter->parent = parent;
 }

@@ -178,23 +178,30 @@ u64 res_counter_read_u64(struct res_counter *counter, int member)
 #endif

 int res_counter_memparse_write_strategy(const char *buf,
-					unsigned long long *res)
+					unsigned long long *resp)
 {
 	char *end;
+	unsigned long long res;

-	/* return RESOURCE_MAX(unlimited) if "-1" is specified */
+	/* return RES_COUNTER_MAX(unlimited) if "-1" is specified */
 	if (*buf == '-') {
-		*res = simple_strtoull(buf + 1, &end, 10);
-		if (*res != 1 || *end != '\0')
+		res = simple_strtoull(buf + 1, &end, 10);
+		if (res != 1 || *end != '\0')
 			return -EINVAL;
-		*res = RESOURCE_MAX;
+		*resp = RES_COUNTER_MAX;
 		return 0;
 	}

-	*res = memparse(buf, &end);
+	res = memparse(buf, &end);
 	if (*end != '\0')
 		return -EINVAL;

-	*res = PAGE_ALIGN(*res);
+	if (PAGE_ALIGN(res) >= res)
+		res = PAGE_ALIGN(res);
+	else
+		res = RES_COUNTER_MAX;
+
+	*resp = res;

 	return 0;
 }
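The PAGE_ALIGN() guard exists because rounding a value near the top of the unsigned range up to a page boundary wraps around to a small number, so writing back a previously reported "unlimited" limit would silently shrink it. A standalone illustration of the overflow check, with the input value assumed:

/* PAGE_ALIGN(x) rounds up to the page size and can wrap near ULLONG_MAX. */
unsigned long long res = ULLONG_MAX - 100;	/* e.g. an "unlimited" value written back */
unsigned long long aligned = PAGE_ALIGN(res);	/* wraps past ULLONG_MAX, ends up tiny */

if (aligned >= res)
	res = aligned;			/* normal case: round up to a page boundary */
else
	res = RES_COUNTER_MAX;		/* overflow: clamp to "unlimited" */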

+ 2 - 2
mm/Kconfig

@@ -245,7 +245,7 @@ config COMPACTION
 config MIGRATION
 	bool "Page migration"
 	def_bool y
-	depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA
+	depends on (NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA) && MMU
 	help
 	  Allows the migration of the physical location of pages of processes
 	  while the virtual addresses are not changed. This is useful in
@@ -480,7 +480,7 @@ config FRONTSWAP

 config CMA
 	bool "Contiguous Memory Allocator"
-	depends on HAVE_MEMBLOCK
+	depends on HAVE_MEMBLOCK && MMU
 	select MIGRATION
 	select MEMORY_ISOLATION
 	help

+ 35 - 24
mm/filemap.c

@@ -467,32 +467,34 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 	error = mem_cgroup_cache_charge(page, current->mm,
 					gfp_mask & GFP_RECLAIM_MASK);
 	if (error)
-		goto out;
+		return error;

 	error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
-	if (error == 0) {
-		page_cache_get(page);
-		page->mapping = mapping;
-		page->index = offset;
-
-		spin_lock_irq(&mapping->tree_lock);
-		error = radix_tree_insert(&mapping->page_tree, offset, page);
-		if (likely(!error)) {
-			mapping->nrpages++;
-			__inc_zone_page_state(page, NR_FILE_PAGES);
-			spin_unlock_irq(&mapping->tree_lock);
-			trace_mm_filemap_add_to_page_cache(page);
-		} else {
-			page->mapping = NULL;
-			/* Leave page->index set: truncation relies upon it */
-			spin_unlock_irq(&mapping->tree_lock);
-			mem_cgroup_uncharge_cache_page(page);
-			page_cache_release(page);
-		}
-		radix_tree_preload_end();
-	} else
+	if (error) {
 		mem_cgroup_uncharge_cache_page(page);
-out:
+		return error;
+	}
+
+	page_cache_get(page);
+	page->mapping = mapping;
+	page->index = offset;
+
+	spin_lock_irq(&mapping->tree_lock);
+	error = radix_tree_insert(&mapping->page_tree, offset, page);
+	radix_tree_preload_end();
+	if (unlikely(error))
+		goto err_insert;
+	mapping->nrpages++;
+	__inc_zone_page_state(page, NR_FILE_PAGES);
+	spin_unlock_irq(&mapping->tree_lock);
+	trace_mm_filemap_add_to_page_cache(page);
+	return 0;
+err_insert:
+	page->mapping = NULL;
+	/* Leave page->index set: truncation relies upon it */
+	spin_unlock_irq(&mapping->tree_lock);
+	mem_cgroup_uncharge_cache_page(page);
+	page_cache_release(page);
 	return error;
 }
 EXPORT_SYMBOL(add_to_page_cache_locked);
@@ -1614,6 +1616,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct inode *inode = mapping->host;
 	pgoff_t offset = vmf->pgoff;
 	struct page *page;
+	bool memcg_oom;
 	pgoff_t size;
 	int ret = 0;

@@ -1622,7 +1625,11 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		return VM_FAULT_SIGBUS;

 	/*
-	 * Do we have something in the page cache already?
+	 * Do we have something in the page cache already?  Either
+	 * way, try readahead, but disable the memcg OOM killer for it
+	 * as readahead is optional and no errors are propagated up
+	 * the fault stack.  The OOM killer is enabled while trying to
+	 * instantiate the faulting page individually below.
 	 */
 	page = find_get_page(mapping, offset);
 	if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
@@ -1630,10 +1637,14 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		 * We found the page, so try async readahead before
 		 * waiting for the lock.
 		 */
+		memcg_oom = mem_cgroup_toggle_oom(false);
 		do_async_mmap_readahead(vma, ra, file, page, offset);
+		mem_cgroup_toggle_oom(memcg_oom);
 	} else if (!page) {
 		/* No page in the page cache at all */
+		memcg_oom = mem_cgroup_toggle_oom(false);
 		do_sync_mmap_readahead(vma, ra, file, offset);
+		mem_cgroup_toggle_oom(memcg_oom);
 		count_vm_event(PGMAJFAULT);
 		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
 		ret = VM_FAULT_MAJOR;

+ 56 - 73
mm/huge_memory.c

@@ -695,11 +695,10 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
 	return pmd;
 }

-static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma)
+static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
 {
 	pmd_t entry;
-	entry = mk_pmd(page, vma->vm_page_prot);
-	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+	entry = mk_pmd(page, prot);
 	entry = pmd_mkhuge(entry);
 	return entry;
 }
@@ -732,7 +731,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 		pte_free(mm, pgtable);
 	} else {
 		pmd_t entry;
-		entry = mk_huge_pmd(page, vma);
+		entry = mk_huge_pmd(page, vma->vm_page_prot);
+		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 		page_add_new_anon_rmap(page, vma, haddr);
 		pgtable_trans_huge_deposit(mm, pmd, pgtable);
 		set_pmd_at(mm, haddr, pmd, entry);
@@ -788,77 +788,57 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	struct page *page;
 	unsigned long haddr = address & HPAGE_PMD_MASK;
-	pte_t *pte;

-	if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
-		if (unlikely(anon_vma_prepare(vma)))
-			return VM_FAULT_OOM;
-		if (unlikely(khugepaged_enter(vma)))
+	if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
+		return VM_FAULT_FALLBACK;
+	if (unlikely(anon_vma_prepare(vma)))
+		return VM_FAULT_OOM;
+	if (unlikely(khugepaged_enter(vma)))
+		return VM_FAULT_OOM;
+	if (!(flags & FAULT_FLAG_WRITE) &&
+			transparent_hugepage_use_zero_page()) {
+		pgtable_t pgtable;
+		struct page *zero_page;
+		bool set;
+		pgtable = pte_alloc_one(mm, haddr);
+		if (unlikely(!pgtable))
 			return VM_FAULT_OOM;
-		if (!(flags & FAULT_FLAG_WRITE) &&
-				transparent_hugepage_use_zero_page()) {
-			pgtable_t pgtable;
-			struct page *zero_page;
-			bool set;
-			pgtable = pte_alloc_one(mm, haddr);
-			if (unlikely(!pgtable))
-				return VM_FAULT_OOM;
-			zero_page = get_huge_zero_page();
-			if (unlikely(!zero_page)) {
-				pte_free(mm, pgtable);
-				count_vm_event(THP_FAULT_FALLBACK);
-				goto out;
-			}
-			spin_lock(&mm->page_table_lock);
-			set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
-					zero_page);
-			spin_unlock(&mm->page_table_lock);
-			if (!set) {
-				pte_free(mm, pgtable);
-				put_huge_zero_page();
-			}
-			return 0;
-		}
-		page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
-					  vma, haddr, numa_node_id(), 0);
-		if (unlikely(!page)) {
+		zero_page = get_huge_zero_page();
+		if (unlikely(!zero_page)) {
+			pte_free(mm, pgtable);
 			count_vm_event(THP_FAULT_FALLBACK);
-			goto out;
-		}
-		count_vm_event(THP_FAULT_ALLOC);
-		if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
-			put_page(page);
-			goto out;
+			return VM_FAULT_FALLBACK;
 		}
-		if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd,
-							  page))) {
-			mem_cgroup_uncharge_page(page);
-			put_page(page);
-			goto out;
+		spin_lock(&mm->page_table_lock);
+		set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
+				zero_page);
+		spin_unlock(&mm->page_table_lock);
+		if (!set) {
+			pte_free(mm, pgtable);
+			put_huge_zero_page();
 		}
-
 		return 0;
 	}
-out:
-	/*
-	 * Use __pte_alloc instead of pte_alloc_map, because we can't
-	 * run pte_offset_map on the pmd, if an huge pmd could
-	 * materialize from under us from a different thread.
-	 */
-	if (unlikely(pmd_none(*pmd)) &&
-	    unlikely(__pte_alloc(mm, vma, pmd, address)))
-		return VM_FAULT_OOM;
-	/* if an huge pmd materialized from under us just retry later */
-	if (unlikely(pmd_trans_huge(*pmd)))
-		return 0;
-	/*
-	 * A regular pmd is established and it can't morph into a huge pmd
-	 * from under us anymore at this point because we hold the mmap_sem
-	 * read mode and khugepaged takes it in write mode. So now it's
-	 * safe to run pte_offset_map().
-	 */
-	pte = pte_offset_map(pmd, address);
-	return handle_pte_fault(mm, vma, address, pte, pmd, flags);
+	page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+			vma, haddr, numa_node_id(), 0);
+	if (unlikely(!page)) {
+		count_vm_event(THP_FAULT_FALLBACK);
+		return VM_FAULT_FALLBACK;
+	}
+	if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
+		put_page(page);
+		count_vm_event(THP_FAULT_FALLBACK);
+		return VM_FAULT_FALLBACK;
+	}
+	if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) {
+		mem_cgroup_uncharge_page(page);
+		put_page(page);
+		count_vm_event(THP_FAULT_FALLBACK);
+		return VM_FAULT_FALLBACK;
+	}
+
+	count_vm_event(THP_FAULT_ALLOC);
+	return 0;
 }

 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -1170,7 +1150,6 @@ alloc:
 		new_page = NULL;

 	if (unlikely(!new_page)) {
-		count_vm_event(THP_FAULT_FALLBACK);
 		if (is_huge_zero_pmd(orig_pmd)) {
 			ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
 					address, pmd, orig_pmd, haddr);
@@ -1181,9 +1160,9 @@ alloc:
 				split_huge_page(page);
 			put_page(page);
 		}
+		count_vm_event(THP_FAULT_FALLBACK);
 		goto out;
 	}
-	count_vm_event(THP_FAULT_ALLOC);

 	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
 		put_page(new_page);
@@ -1191,10 +1170,13 @@ alloc:
 			split_huge_page(page);
 			put_page(page);
 		}
+		count_vm_event(THP_FAULT_FALLBACK);
 		ret |= VM_FAULT_OOM;
 		goto out;
 	}

+	count_vm_event(THP_FAULT_ALLOC);
+
 	if (is_huge_zero_pmd(orig_pmd))
 		clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
 	else
@@ -1215,7 +1197,8 @@ alloc:
 		goto out_mn;
 	} else {
 		pmd_t entry;
-		entry = mk_huge_pmd(new_page, vma);
+		entry = mk_huge_pmd(new_page, vma->vm_page_prot);
+		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 		pmdp_clear_flush(vma, haddr, pmd);
 		page_add_new_anon_rmap(new_page, vma, haddr);
 		set_pmd_at(mm, haddr, pmd, entry);
@@ -1666,7 +1649,6 @@ static void __split_huge_page_refcount(struct page *page,
 	BUG_ON(atomic_read(&page->_count) <= 0);

 	__mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
-	__mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);

 	ClearPageCompound(page);
 	compound_unlock(page);
@@ -2364,7 +2346,8 @@ static void collapse_huge_page(struct mm_struct *mm,
 	__SetPageUptodate(new_page);
 	pgtable = pmd_pgtable(_pmd);

-	_pmd = mk_huge_pmd(new_page, vma);
+	_pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
+	_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);

 	/*
 	 * spin_lock() below is not the equivalent of smp_wmb(), so

+ 337 - 534
mm/memcontrol.c

@@ -39,7 +39,6 @@
 #include <linux/limits.h>
 #include <linux/export.h>
 #include <linux/mutex.h>
-#include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
@@ -85,26 +84,12 @@ static int really_do_swap_account __initdata = 0;
 #endif


-/*
- * Statistics for memory cgroup.
- */
-enum mem_cgroup_stat_index {
-	/*
-	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
-	 */
-	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
-	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
-	MEM_CGROUP_STAT_RSS_HUGE,	/* # of pages charged as anon huge */
-	MEM_CGROUP_STAT_FILE_MAPPED,	/* # of pages charged as file rss */
-	MEM_CGROUP_STAT_SWAP,		/* # of pages, swapped out */
-	MEM_CGROUP_STAT_NSTATS,
-};
-
 static const char * const mem_cgroup_stat_names[] = {
 	"cache",
 	"rss",
 	"rss_huge",
 	"mapped_file",
+	"writeback",
 	"swap",
 };

@@ -175,10 +160,6 @@ struct mem_cgroup_per_zone {

 	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];

-	struct rb_node		tree_node;	/* RB tree node */
-	unsigned long long	usage_in_excess;/* Set to the value by which */
-						/* the soft limit is exceeded*/
-	bool			on_tree;
 	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
 						/* use container_of	   */
 };
@@ -187,26 +168,6 @@ struct mem_cgroup_per_node {
 	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
 };

-/*
- * Cgroups above their limits are maintained in a RB-Tree, independent of
- * their hierarchy representation
- */
-
-struct mem_cgroup_tree_per_zone {
-	struct rb_root rb_root;
-	spinlock_t lock;
-};
-
-struct mem_cgroup_tree_per_node {
-	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
-};
-
-struct mem_cgroup_tree {
-	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
-};
-
-static struct mem_cgroup_tree soft_limit_tree __read_mostly;
-
 struct mem_cgroup_threshold {
 	struct eventfd_ctx *eventfd;
 	u64 threshold;
@@ -280,6 +241,7 @@ struct mem_cgroup {

 	bool		oom_lock;
 	atomic_t	under_oom;
+	atomic_t	oom_wakeups;

 	int	swappiness;
 	/* OOM-Killer disable */
@@ -304,7 +266,7 @@ struct mem_cgroup {
 	 * Should we move charges of a task when a task is moved into this
 	 * mem_cgroup ? And what type of charges should we move ?
 	 */
-	unsigned long 	move_charge_at_immigrate;
+	unsigned long move_charge_at_immigrate;
 	/*
 	 * set > 0 if pages under this cgroup are moving to other cgroup.
 	 */
@@ -341,6 +303,22 @@ struct mem_cgroup {
 	atomic_t	numainfo_events;
 	atomic_t	numainfo_updating;
 #endif
+	/*
+	 * Protects soft_contributed transitions.
+	 * See mem_cgroup_update_soft_limit
+	 */
+	spinlock_t soft_lock;
+
+	/*
+	 * If true then this group has increased parents' children_in_excess
+	 * when it got over the soft limit.
+	 * When a group falls bellow the soft limit, parents' children_in_excess
+	 * is decreased and soft_contributed changed to false.
+	 */
+	bool soft_contributed;
+
+	/* Number of children that are in soft limit excess */
+	atomic_t children_in_excess;

 	struct mem_cgroup_per_node *nodeinfo[0];
 	/* WARNING: nodeinfo must be the last member here */
@@ -444,7 +422,6 @@ static bool move_file(void)
  * limit reclaim to prevent infinite loops, if they ever occur.
  */
 #define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
-#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

 enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
@@ -671,164 +648,6 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
 	return mem_cgroup_zoneinfo(memcg, nid, zid);
 }

-static struct mem_cgroup_tree_per_zone *
-soft_limit_tree_node_zone(int nid, int zid)
-{
-	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
-}
-
-static struct mem_cgroup_tree_per_zone *
-soft_limit_tree_from_page(struct page *page)
-{
-	int nid = page_to_nid(page);
-	int zid = page_zonenum(page);
-
-	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
-}
-
-static void
-__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
-				struct mem_cgroup_per_zone *mz,
-				struct mem_cgroup_tree_per_zone *mctz,
-				unsigned long long new_usage_in_excess)
-{
-	struct rb_node **p = &mctz->rb_root.rb_node;
-	struct rb_node *parent = NULL;
-	struct mem_cgroup_per_zone *mz_node;
-
-	if (mz->on_tree)
-		return;
-
-	mz->usage_in_excess = new_usage_in_excess;
-	if (!mz->usage_in_excess)
-		return;
-	while (*p) {
-		parent = *p;
-		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
-					tree_node);
-		if (mz->usage_in_excess < mz_node->usage_in_excess)
-			p = &(*p)->rb_left;
-		/*
-		 * We can't avoid mem cgroups that are over their soft
-		 * limit by the same amount
-		 */
-		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
-			p = &(*p)->rb_right;
-	}
-	rb_link_node(&mz->tree_node, parent, p);
-	rb_insert_color(&mz->tree_node, &mctz->rb_root);
-	mz->on_tree = true;
-}
-
-static void
-__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
-				struct mem_cgroup_per_zone *mz,
-				struct mem_cgroup_tree_per_zone *mctz)
-{
-	if (!mz->on_tree)
-		return;
-	rb_erase(&mz->tree_node, &mctz->rb_root);
-	mz->on_tree = false;
-}
-
-static void
-mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
-				struct mem_cgroup_per_zone *mz,
-				struct mem_cgroup_tree_per_zone *mctz)
-{
-	spin_lock(&mctz->lock);
-	__mem_cgroup_remove_exceeded(memcg, mz, mctz);
-	spin_unlock(&mctz->lock);
-}
-
-
-static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
-{
-	unsigned long long excess;
-	struct mem_cgroup_per_zone *mz;
-	struct mem_cgroup_tree_per_zone *mctz;
-	int nid = page_to_nid(page);
-	int zid = page_zonenum(page);
-	mctz = soft_limit_tree_from_page(page);
-
-	/*
-	 * Necessary to update all ancestors when hierarchy is used.
-	 * because their event counter is not touched.
-	 */
-	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
-		mz = mem_cgroup_zoneinfo(memcg, nid, zid);
-		excess = res_counter_soft_limit_excess(&memcg->res);
-		/*
-		 * We have to update the tree if mz is on RB-tree or
-		 * mem is over its softlimit.
-		 */
-		if (excess || mz->on_tree) {
-			spin_lock(&mctz->lock);
-			/* if on-tree, remove it */
-			if (mz->on_tree)
-				__mem_cgroup_remove_exceeded(memcg, mz, mctz);
-			/*
-			 * Insert again. mz->usage_in_excess will be updated.
-			 * If excess is 0, no tree ops.
-			 */
-			__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
-			spin_unlock(&mctz->lock);
-		}
-	}
-}
-
-static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
-{
-	int node, zone;
-	struct mem_cgroup_per_zone *mz;
-	struct mem_cgroup_tree_per_zone *mctz;
-
-	for_each_node(node) {
-		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
-			mz = mem_cgroup_zoneinfo(memcg, node, zone);
-			mctz = soft_limit_tree_node_zone(node, zone);
-			mem_cgroup_remove_exceeded(memcg, mz, mctz);
-		}
-	}
-}
-
-static struct mem_cgroup_per_zone *
-__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
-{
-	struct rb_node *rightmost = NULL;
-	struct mem_cgroup_per_zone *mz;
-
-retry:
-	mz = NULL;
-	rightmost = rb_last(&mctz->rb_root);
-	if (!rightmost)
-		goto done;		/* Nothing to reclaim from */
-
-	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
-	/*
-	 * Remove the node now but someone else can add it back,
-	 * we will to add it back at the end of reclaim to its correct
-	 * position in the tree.
-	 */
-	__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
-	if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
-		!css_tryget(&mz->memcg->css))
-		goto retry;
-done:
-	return mz;
-}
-
-static struct mem_cgroup_per_zone *
-mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
-{
-	struct mem_cgroup_per_zone *mz;
-
-	spin_lock(&mctz->lock);
-	mz = __mem_cgroup_largest_soft_limit_node(mctz);
-	spin_unlock(&mctz->lock);
-	return mz;
-}
-
 /*
  * Implementation Note: reading percpu statistics for memcg.
  *
@@ -1002,6 +821,48 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 	return false;
 }

+/*
+ * Called from rate-limited memcg_check_events when enough
+ * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure
+ * that all the parents up the hierarchy will be notified that this group
+ * is in excess or that it is not in excess anymore. mmecg->soft_contributed
+ * makes the transition a single action whenever the state flips from one to
+ * the other.
+ */
+static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg)
+{
+	unsigned long long excess = res_counter_soft_limit_excess(&memcg->res);
+	struct mem_cgroup *parent = memcg;
+	int delta = 0;
+
+	spin_lock(&memcg->soft_lock);
+	if (excess) {
+		if (!memcg->soft_contributed) {
+			delta = 1;
+			memcg->soft_contributed = true;
+		}
+	} else {
+		if (memcg->soft_contributed) {
+			delta = -1;
+			memcg->soft_contributed = false;
+		}
+	}
+
+	/*
+	 * Necessary to update all ancestors when hierarchy is used
+	 * because their event counter is not touched.
+	 * We track children even outside the hierarchy for the root
+	 * cgroup because tree walk starting at root should visit
+	 * all cgroups and we want to prevent from pointless tree
+	 * walk if no children is below the limit.
+	 */
+	while (delta && (parent = parent_mem_cgroup(parent)))
+		atomic_add(delta, &parent->children_in_excess);
+	if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
+		atomic_add(delta, &root_mem_cgroup->children_in_excess);
+	spin_unlock(&memcg->soft_lock);
+}
+
 /*
  * Check events in order.
  *
@@ -1025,7 +886,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)

 		mem_cgroup_threshold(memcg);
 		if (unlikely(do_softlimit))
-			mem_cgroup_update_tree(memcg, page);
+			mem_cgroup_update_soft_limit(memcg);
 #if MAX_NUMNODES > 1
 		if (unlikely(do_numainfo))
 			atomic_inc(&memcg->numainfo_events);
@@ -1068,6 +929,15 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 	return memcg;
 }

+static enum mem_cgroup_filter_t
+mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
+		mem_cgroup_iter_filter cond)
+{
+	if (!cond)
+		return VISIT;
+	return cond(memcg, root);
+}
+
 /*
  * Returns a next (in a pre-order walk) alive memcg (with elevated css
  * ref. count) or NULL if the whole root's subtree has been visited.
@@ -1075,7 +945,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
  * helper function to be used by mem_cgroup_iter
  */
 static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
-		struct mem_cgroup *last_visited)
+		struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond)
 {
 	struct cgroup_subsys_state *prev_css, *next_css;

@@ -1093,11 +963,31 @@ skip_node:
 	if (next_css) {
 		struct mem_cgroup *mem = mem_cgroup_from_css(next_css);

-		if (css_tryget(&mem->css))
-			return mem;
-		else {
+		switch (mem_cgroup_filter(mem, root, cond)) {
+		case SKIP:
 			prev_css = next_css;
 			goto skip_node;
+		case SKIP_TREE:
+			if (mem == root)
+				return NULL;
+			/*
+			 * css_rightmost_descendant is not an optimal way to
+			 * skip through a subtree (especially for imbalanced
+			 * trees leaning to right) but that's what we have right
+			 * now. More effective solution would be traversing
+			 * right-up for first non-NULL without calling
+			 * css_next_descendant_pre afterwards.
+			 */
+			prev_css = css_rightmost_descendant(next_css);
+			goto skip_node;
+		case VISIT:
+			if (css_tryget(&mem->css))
+				return mem;
+			else {
+				prev_css = next_css;
+				goto skip_node;
+			}
+			break;
 		}
 		}
 	}

  * @root: hierarchy root
  * @root: hierarchy root
  * @prev: previously returned memcg, NULL on first invocation
  * @reclaim: cookie for shared reclaim walks, NULL for full walks
+ * @cond: filter for visited nodes, NULL for no filter
  *
  * Returns references to children of the hierarchy below @root, or
  * @root itself, or %NULL after a full round-trip.
  * divide up the memcgs in the hierarchy among all concurrent
  * divide up the memcgs in the hierarchy among all concurrent
  * reclaimers operating on the same zone and priority.
  */
+struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
 				   struct mem_cgroup *prev,
 				   struct mem_cgroup *prev,
+				   struct mem_cgroup_reclaim_cookie *reclaim,
+				   mem_cgroup_iter_filter cond)
 {
 {
 	struct mem_cgroup *memcg = NULL;
 	struct mem_cgroup *last_visited = NULL;

-		return NULL;
+	if (mem_cgroup_disabled()) {
+		/* first call must return non-NULL, second return NULL */
+		return (struct mem_cgroup *)(unsigned long)!prev;
+	}
 
 
 	if (!root)
 	if (!root)
 		root = root_mem_cgroup;
 		root = root_mem_cgroup;
@@ -1192,7 +1086,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 	if (!root->use_hierarchy && root != root_mem_cgroup) {
 	if (!root->use_hierarchy && root != root_mem_cgroup) {
 		if (prev)
 			goto out_css_put;
-		return root;
+		if (mem_cgroup_filter(root, root, cond) == VISIT)
+			return root;
+		return NULL;
 	}

 	rcu_read_lock();
@@ -1215,7 +1111,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 			last_visited = mem_cgroup_iter_load(iter, root, &seq);
 		}

-		memcg = __mem_cgroup_iter_next(root, last_visited);
+		memcg = __mem_cgroup_iter_next(root, last_visited, cond);

 		if (reclaim) {
 			mem_cgroup_iter_update(iter, last_visited, memcg, seq);
@@ -1226,7 +1122,11 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 				reclaim->generation = iter->generation;
 		}

-		if (prev && !memcg)
+		/*
+		 * We have finished the whole tree walk or no group has been
+		 * visited because filter told us to skip the root node.
+		 */
+		if (!memcg && (prev || (cond && !last_visited)))
 			goto out_unlock;
 	}
 out_unlock:
@@ -1867,6 +1767,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
 	return total;
 }

+#if MAX_NUMNODES > 1
 /**
  * test_mem_cgroup_node_reclaimable
  * @memcg: the target memcg
@@ -1889,7 +1790,6 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
 	return false;

 }
-#if MAX_NUMNODES > 1

 /*
  * Always updating the nodemask is not very good - even if we have an empty
@@ -1957,115 +1857,64 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 	return node;
 }

-/*
- * Check all nodes whether it contains reclaimable pages or not.
- * For quick scan, we make use of scan_nodes. This will allow us to skip
- * unused nodes. But scan_nodes is lazily updated and may not cotain
- * enough new information. We need to do double check.
- */
-static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
-{
-	int nid;
-
-	/*
-	 * quick check...making use of scan_node.
-	 * We can skip unused nodes.
-	 */
-	if (!nodes_empty(memcg->scan_nodes)) {
-		for (nid = first_node(memcg->scan_nodes);
-		     nid < MAX_NUMNODES;
-		     nid = next_node(nid, memcg->scan_nodes)) {
-
-			if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
-				return true;
-		}
-	}
-	/*
-	 * Check rest of nodes.
-	 */
-	for_each_node_state(nid, N_MEMORY) {
-		if (node_isset(nid, memcg->scan_nodes))
-			continue;
-		if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
-			return true;
-	}
-	return false;
-}
-
 #else
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 {
 	return 0;
 }

-static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
-{
-	return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
-}
 #endif

-static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
-				   struct zone *zone,
-				   gfp_t gfp_mask,
-				   unsigned long *total_scanned)
-{
-	struct mem_cgroup *victim = NULL;
-	int total = 0;
-	int loop = 0;
-	unsigned long excess;
-	unsigned long nr_scanned;
-	struct mem_cgroup_reclaim_cookie reclaim = {
-		.zone = zone,
-		.priority = 0,
-	};
-
-	excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
-
-	while (1) {
-		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
-		if (!victim) {
-			loop++;
-			if (loop >= 2) {
-				/*
-				 * If we have not been able to reclaim
-				 * anything, it might because there are
-				 * no reclaimable pages under this hierarchy
-				 */
-				if (!total)
-					break;
-				/*
-				 * We want to do more targeted reclaim.
-				 * excess >> 2 is not to excessive so as to
-				 * reclaim too much, nor too less that we keep
-				 * coming back to reclaim from this cgroup
-				 */
-				if (total >= (excess >> 2) ||
-					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
-					break;
-			}
-			continue;
-		}
-		if (!mem_cgroup_reclaimable(victim, false))
-			continue;
-		total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
-						     zone, &nr_scanned);
-		*total_scanned += nr_scanned;
-		if (!res_counter_soft_limit_excess(&root_memcg->res))
+/*
+ * A group is eligible for the soft limit reclaim under the given root
+ * hierarchy if
+ *	a) it is over its soft limit
+ *	b) any parent up the hierarchy is over its soft limit
+ *
+ * If the given group doesn't have any children over the limit then it
+ * doesn't make any sense to iterate its subtree.
+ */
+enum mem_cgroup_filter_t
+mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
+		struct mem_cgroup *root)
+{
+	struct mem_cgroup *parent;
+
+	if (!memcg)
+		memcg = root_mem_cgroup;
+	parent = memcg;
+
+	if (res_counter_soft_limit_excess(&memcg->res))
+		return VISIT;
+
+	/*
+	 * If any parent up to the root in the hierarchy is over its soft limit
+	 * then we have to obey and reclaim from this group as well.
+	 */
+	while ((parent = parent_mem_cgroup(parent))) {
+		if (res_counter_soft_limit_excess(&parent->res))
+			return VISIT;
+		if (parent == root)
 			break;
 	}
-	mem_cgroup_iter_break(root_memcg, victim);
-	return total;
+
+	if (!atomic_read(&memcg->children_in_excess))
+		return SKIP_TREE;
+	return SKIP;
 }

+static DEFINE_SPINLOCK(memcg_oom_lock);
+
 /*
  * Check OOM-Killer is already running under our hierarchy.
  * If someone is running, return false.
- * Has to be called with memcg_oom_lock
  */
-static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
+static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *iter, *failed = NULL;

+	spin_lock(&memcg_oom_lock);
+
 	for_each_mem_cgroup_tree(iter, memcg) {
 		if (iter->oom_lock) {
 			/*
@@ -2079,33 +1928,33 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
 			iter->oom_lock = true;
 	}

-	if (!failed)
-		return true;
-
-	/*
-	 * OK, we failed to lock the whole subtree so we have to clean up
-	 * what we set up to the failing subtree
-	 */
-	for_each_mem_cgroup_tree(iter, memcg) {
-		if (iter == failed) {
-			mem_cgroup_iter_break(memcg, iter);
-			break;
+	if (failed) {
+		/*
+		 * OK, we failed to lock the whole subtree so we have
+		 * to clean up what we set up to the failing subtree
+		 */
+		for_each_mem_cgroup_tree(iter, memcg) {
+			if (iter == failed) {
+				mem_cgroup_iter_break(memcg, iter);
+				break;
+			}
+			iter->oom_lock = false;
 		}
-		iter->oom_lock = false;
 	}
-	return false;
+
+	spin_unlock(&memcg_oom_lock);
+
+	return !failed;
 }

-/*
- * Has to be called with memcg_oom_lock
- */
-static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
+static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *iter;

+	spin_lock(&memcg_oom_lock);
 	for_each_mem_cgroup_tree(iter, memcg)
 		iter->oom_lock = false;
-	return 0;
+	spin_unlock(&memcg_oom_lock);
 }

 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
@@ -2129,7 +1978,6 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
 		atomic_add_unless(&iter->under_oom, -1, 0);
 }

-static DEFINE_SPINLOCK(memcg_oom_lock);
 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

 struct oom_wait_info {
@@ -2159,6 +2007,7 @@ static int memcg_oom_wake_function(wait_queue_t *wait,

 static void memcg_wakeup_oom(struct mem_cgroup *memcg)
 {
+	atomic_inc(&memcg->oom_wakeups);
 	/* for filtering, pass "memcg" as argument. */
 	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
 }
@@ -2170,56 +2019,136 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
 }

 /*
- * try to call OOM killer. returns false if we should exit memory-reclaim loop.
+ * try to call OOM killer
  */
-static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
-				  int order)
+static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
-	struct oom_wait_info owait;
-	bool locked, need_to_kill;
+	bool locked;
+	int wakeups;
-	owait.memcg = memcg;
-	owait.wait.flags = 0;
-	owait.wait.func = memcg_oom_wake_function;
-	owait.wait.private = current;
-	INIT_LIST_HEAD(&owait.wait.task_list);
-	need_to_kill = true;
-	mem_cgroup_mark_under_oom(memcg);
+	if (!current->memcg_oom.may_oom)
+		return;
+
+	current->memcg_oom.in_memcg_oom = 1;
-	/* At first, try to OOM lock hierarchy under memcg.*/
-	spin_lock(&memcg_oom_lock);
-	locked = mem_cgroup_oom_lock(memcg);
 	/*
-	 * Even if signal_pending(), we can't quit charge() loop without
-	 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
-	 * under OOM is always welcomed, use TASK_KILLABLE here.
+	 * As with any blocking lock, a contender needs to start
+	 * listening for wakeups before attempting the trylock,
+	 * otherwise it can miss the wakeup from the unlock and sleep
+	 * indefinitely.  This is just open-coded because our locking
+	 * is so particular to memcg hierarchies.
 	 */
-	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
-	if (!locked || memcg->oom_kill_disable)
-		need_to_kill = false;
+	wakeups = atomic_read(&memcg->oom_wakeups);
+	mem_cgroup_mark_under_oom(memcg);
+
+	locked = mem_cgroup_oom_trylock(memcg);
+
 	if (locked)
 		mem_cgroup_oom_notify(memcg);
-	spin_unlock(&memcg_oom_lock);
-	if (need_to_kill) {
-		finish_wait(&memcg_oom_waitq, &owait.wait);
+	if (locked && !memcg->oom_kill_disable) {
+		mem_cgroup_unmark_under_oom(memcg);
 		mem_cgroup_out_of_memory(memcg, mask, order);
+		mem_cgroup_oom_unlock(memcg);
+		/*
+		 * There is no guarantee that an OOM-lock contender
+		 * sees the wakeups triggered by the OOM kill
+		 * uncharges.  Wake any sleepers explicitly.
+		 */
+		memcg_oom_recover(memcg);
 	} else {
-		schedule();
-		finish_wait(&memcg_oom_waitq, &owait.wait);
+		/*
+		 * A system call can just return -ENOMEM, but if this
+		 * is a page fault and somebody else is handling the
+		 * OOM already, we need to sleep on the OOM waitqueue
+		 * for this memcg until the situation is resolved.
+		 * Which can take some time because it might be
+		 * handled by a userspace task.
+		 *
+		 * However, this is the charge context, which means
+		 * that we may sit on a large call stack and hold
+		 * various filesystem locks, the mmap_sem etc. and we
+		 * don't want the OOM handler to deadlock on them
+		 * while we sit here and wait.  Store the current OOM
+		 * context in the task_struct, then return -ENOMEM.
+		 * At the end of the page fault handler, with the
+		 * stack unwound, pagefault_out_of_memory() will check
+		 * back with us by calling
+		 * mem_cgroup_oom_synchronize(), possibly putting the
+		 * task to sleep.
+		 */
+		current->memcg_oom.oom_locked = locked;
+		current->memcg_oom.wakeups = wakeups;
+		css_get(&memcg->css);
+		current->memcg_oom.wait_on_memcg = memcg;
 	}
-	spin_lock(&memcg_oom_lock);
-	if (locked)
-		mem_cgroup_oom_unlock(memcg);
-	memcg_wakeup_oom(memcg);
-	spin_unlock(&memcg_oom_lock);
+}
-	mem_cgroup_unmark_under_oom(memcg);
+/**
+ * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ *
+ * This has to be called at the end of a page fault if the memcg
+ * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
+ *
+ * Memcg supports userspace OOM handling, so failed allocations must
+ * sleep on a waitqueue until the userspace task resolves the
+ * situation.  Sleeping directly in the charge context with all kinds
+ * of locks held is not a good idea, instead we remember an OOM state
+ * in the task and mem_cgroup_oom_synchronize() has to be called at
+ * the end of the page fault to put the task to sleep and clean up the
+ * OOM state.
+ *
+ * Returns %true if an ongoing memcg OOM situation was detected and
+ * finalized, %false otherwise.
+ */
+bool mem_cgroup_oom_synchronize(void)
+{
+	struct oom_wait_info owait;
+	struct mem_cgroup *memcg;
-	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+	/* OOM is global, do not handle */
+	if (!current->memcg_oom.in_memcg_oom)
 		return false;
-	/* Give chance to dying process */
-	schedule_timeout_uninterruptible(1);
+
+	/*
+	 * We invoked the OOM killer but there is a chance that a kill
+	 * did not free up any charges.  Everybody else might already
+	 * be sleeping, so restart the fault and keep the rampage
+	 * going until some charges are released.
+	 */
+	memcg = current->memcg_oom.wait_on_memcg;
+	if (!memcg)
+		goto out;
+
+	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+		goto out_memcg;
+
+	owait.memcg = memcg;
+	owait.wait.flags = 0;
+	owait.wait.func = memcg_oom_wake_function;
+	owait.wait.private = current;
+	INIT_LIST_HEAD(&owait.wait.task_list);
+
+	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+	/* Only sleep if we didn't miss any wakeups since OOM */
+	if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
+		schedule();
+	finish_wait(&memcg_oom_waitq, &owait.wait);
+out_memcg:
+	mem_cgroup_unmark_under_oom(memcg);
+	if (current->memcg_oom.oom_locked) {
+		mem_cgroup_oom_unlock(memcg);
+		/*
+		 * There is no guarantee that an OOM-lock contender
+		 * sees the wakeups triggered by the OOM kill
+		 * uncharges.  Wake any sleepers explicitly.
+		 */
+		memcg_oom_recover(memcg);
+	}
+	css_put(&memcg->css);
+	current->memcg_oom.wait_on_memcg = NULL;
+out:
+	current->memcg_oom.in_memcg_oom = 0;
 	return true;
 }

@@ -2288,7 +2217,7 @@ void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
 }

 void mem_cgroup_update_page_stat(struct page *page,
-				 enum mem_cgroup_page_stat_item idx, int val)
+				 enum mem_cgroup_stat_index idx, int val)
 {
 	struct mem_cgroup *memcg;
 	struct page_cgroup *pc = lookup_page_cgroup(page);
@@ -2297,18 +2226,11 @@ void mem_cgroup_update_page_stat(struct page *page,
 	if (mem_cgroup_disabled())
 		return;

+	VM_BUG_ON(!rcu_read_lock_held());
 	memcg = pc->mem_cgroup;
 	if (unlikely(!memcg || !PageCgroupUsed(pc)))
 		return;

-	switch (idx) {
-	case MEMCG_NR_FILE_MAPPED:
-		idx = MEM_CGROUP_STAT_FILE_MAPPED;
-		break;
-	default:
-		BUG();
-	}
-
 	this_cpu_add(memcg->stat->count[idx], val);
 }

@@ -2450,7 +2372,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
 			flush_work(&stock->work);
 	}
 out:
- 	put_online_cpus();
+	put_online_cpus();
 }

 /*
@@ -2532,12 +2454,11 @@ enum {
 	CHARGE_RETRY,		/* need to retry but retry is not bad */
 	CHARGE_NOMEM,		/* we can't do more. return -ENOMEM */
 	CHARGE_WOULDBLOCK,	/* GFP_WAIT wasn't set and no enough res. */
-	CHARGE_OOM_DIE,		/* the current is killed because of OOM */
 };

 static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 				unsigned int nr_pages, unsigned int min_pages,
-				bool oom_check)
+				bool invoke_oom)
 {
 	unsigned long csize = nr_pages * PAGE_SIZE;
 	struct mem_cgroup *mem_over_limit;
@@ -2594,14 +2515,10 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	if (mem_cgroup_wait_acct_move(mem_over_limit))
 		return CHARGE_RETRY;

-	/* If we don't need to call oom-killer at el, return immediately */
-	if (!oom_check)
-		return CHARGE_NOMEM;
-	/* check OOM */
-	if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
-		return CHARGE_OOM_DIE;
+	if (invoke_oom)
+		mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
-	return CHARGE_RETRY;
+	return CHARGE_NOMEM;
 }

 /*
@@ -2704,7 +2621,7 @@ again:
 	}

 	do {
-		bool oom_check;
+		bool invoke_oom = oom && !nr_oom_retries;

 		/* If killed, bypass charge */
 		if (fatal_signal_pending(current)) {
@@ -2712,14 +2629,8 @@ again:
 			goto bypass;
 		}

-		oom_check = false;
-		if (oom && !nr_oom_retries) {
-			oom_check = true;
-			nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
-		}
-
-		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages,
-		    oom_check);
+		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
+					   nr_pages, invoke_oom);
 		switch (ret) {
 		case CHARGE_OK:
 			break;
@@ -2732,16 +2643,12 @@ again:
 			css_put(&memcg->css);
 			goto nomem;
 		case CHARGE_NOMEM: /* OOM routine works */
-			if (!oom) {
+			if (!oom || invoke_oom) {
 				css_put(&memcg->css);
 				goto nomem;
 			}
-			/* If oom, we never return -ENOMEM */
 			nr_oom_retries--;
 			break;
-		case CHARGE_OOM_DIE: /* Killed by OOM Killer */
-			css_put(&memcg->css);
-			goto bypass;
 		}
 	} while (ret != CHARGE_OK);

@@ -2882,7 +2789,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
 	 * is accessed after testing USED bit. To make pc->mem_cgroup visible
 	 * before USED bit, we need memory barrier here.
 	 * See mem_cgroup_add_lru_list(), etc.
- 	 */
+	 */
 	smp_wmb();
 	SetPageCgroupUsed(pc);

@@ -2905,9 +2812,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
 	unlock_page_cgroup(pc);

 	/*
-	 * "charge_statistics" updated event counter. Then, check it.
-	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
-	 * if they exceeds softlimit.
+	 * "charge_statistics" updated event counter.
 	 */
 	memcg_check_events(memcg, page);
 }
@@ -3626,9 +3531,9 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
 	 * the page allocator. Therefore, the following sequence when backed by
 	 * the SLUB allocator:
 	 *
-	 * 	memcg_stop_kmem_account();
-	 * 	kmalloc(<large_number>)
-	 * 	memcg_resume_kmem_account();
+	 *	memcg_stop_kmem_account();
+	 *	kmalloc(<large_number>)
+	 *	memcg_resume_kmem_account();
 	 *
 	 * would effectively ignore the fact that we should skip accounting,
 	 * since it will drive us directly to this function without passing
@@ -3750,6 +3655,20 @@ void mem_cgroup_split_huge_fixup(struct page *head)
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */

+static inline
+void mem_cgroup_move_account_page_stat(struct mem_cgroup *from,
+					struct mem_cgroup *to,
+					unsigned int nr_pages,
+					enum mem_cgroup_stat_index idx)
+{
+	/* Update stat data for mem_cgroup */
+	preempt_disable();
+	WARN_ON_ONCE(from->stat->count[idx] < nr_pages);
+	__this_cpu_add(from->stat->count[idx], -nr_pages);
+	__this_cpu_add(to->stat->count[idx], nr_pages);
+	preempt_enable();
+}
+
 /**
  * mem_cgroup_move_account - move account of the page
  * @page: the page
@@ -3795,13 +3714,14 @@ static int mem_cgroup_move_account(struct page *page,

 	move_lock_mem_cgroup(from, &flags);

-	if (!anon && page_mapped(page)) {
-		/* Update mapped_file data for mem_cgroup */
-		preempt_disable();
-		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
-		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
-		preempt_enable();
-	}
+	if (!anon && page_mapped(page))
+		mem_cgroup_move_account_page_stat(from, to, nr_pages,
+			MEM_CGROUP_STAT_FILE_MAPPED);
+
+	if (PageWriteback(page))
+		mem_cgroup_move_account_page_stat(from, to, nr_pages,
+			MEM_CGROUP_STAT_WRITEBACK);
+
 	mem_cgroup_charge_statistics(from, page, anon, -nr_pages);

 	/* caller should have done css_get */
@@ -4657,7 +4577,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 				   MEM_CGROUP_RECLAIM_SHRINK);
 		curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
 		/* Usage is reduced ? */
-  		if (curusage >= oldusage)
+		if (curusage >= oldusage)
 			retry_count--;
 		else
 			oldusage = curusage;
@@ -4678,7 +4598,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 	int enlarge = 0;

 	/* see mem_cgroup_resize_res_limit */
- 	retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
+	retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
 	oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
 	while (retry_count) {
 		if (signal_pending(current)) {
@@ -4727,98 +4647,6 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 	return ret;
 }

-unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
-					    gfp_t gfp_mask,
-					    unsigned long *total_scanned)
-{
-	unsigned long nr_reclaimed = 0;
-	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
-	unsigned long reclaimed;
-	int loop = 0;
-	struct mem_cgroup_tree_per_zone *mctz;
-	unsigned long long excess;
-	unsigned long nr_scanned;
-
-	if (order > 0)
-		return 0;
-
-	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
-	/*
-	 * This loop can run a while, specially if mem_cgroup's continuously
-	 * keep exceeding their soft limit and putting the system under
-	 * pressure
-	 */
-	do {
-		if (next_mz)
-			mz = next_mz;
-		else
-			mz = mem_cgroup_largest_soft_limit_node(mctz);
-		if (!mz)
-			break;
-
-		nr_scanned = 0;
-		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
-						    gfp_mask, &nr_scanned);
-		nr_reclaimed += reclaimed;
-		*total_scanned += nr_scanned;
-		spin_lock(&mctz->lock);
-
-		/*
-		 * If we failed to reclaim anything from this memory cgroup
-		 * it is time to move on to the next cgroup
-		 */
-		next_mz = NULL;
-		if (!reclaimed) {
-			do {
-				/*
-				 * Loop until we find yet another one.
-				 *
-				 * By the time we get the soft_limit lock
-				 * again, someone might have aded the
-				 * group back on the RB tree. Iterate to
-				 * make sure we get a different mem.
-				 * mem_cgroup_largest_soft_limit_node returns
-				 * NULL if no other cgroup is present on
-				 * the tree
-				 */
-				next_mz =
-				__mem_cgroup_largest_soft_limit_node(mctz);
-				if (next_mz == mz)
-					css_put(&next_mz->memcg->css);
-				else /* next_mz == NULL or other memcg */
-					break;
-			} while (1);
-		}
-		__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
-		excess = res_counter_soft_limit_excess(&mz->memcg->res);
-		/*
-		 * One school of thought says that we should not add
-		 * back the node to the tree if reclaim returns 0.
-		 * But our reclaim could return 0, simply because due
-		 * to priority we are exposing a smaller subset of
-		 * memory to reclaim from. Consider this as a longer
-		 * term TODO.
-		 */
-		/* If excess == 0, no tree ops */
-		__mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
-		spin_unlock(&mctz->lock);
-		css_put(&mz->memcg->css);
-		loop++;
-		/*
-		 * Could not reclaim anything and there are no more
-		 * mem cgroups to try or we seem to be looping without
-		 * reclaiming anything.
-		 */
-		if (!nr_reclaimed &&
-			(next_mz == NULL ||
-			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
-			break;
-	} while (!nr_reclaimed);
-	if (next_mz)
-		css_put(&next_mz->memcg->css);
-	return nr_reclaimed;
-}
-
 /**
  * mem_cgroup_force_empty_list - clears LRU of a group
  * @memcg: group to clear
@@ -4990,18 +4818,12 @@ static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css,
 					unsigned int event)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
-	int ret;

 	if (mem_cgroup_is_root(memcg))
 		return -EINVAL;
-	css_get(&memcg->css);
-	ret = mem_cgroup_force_empty(memcg);
-	css_put(&memcg->css);
-
-	return ret;
+	return mem_cgroup_force_empty(memcg);
 }

-
 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
 				     struct cftype *cft)
 {
@@ -5139,7 +4961,7 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
 	 */
 	mutex_lock(&memcg_create_mutex);
 	mutex_lock(&set_limit_mutex);
-	if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
+	if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) {
 		if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) {
 			ret = -EBUSY;
 			goto out;
@@ -5149,7 +4971,7 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)

 		ret = memcg_update_cache_sizes(memcg);
 		if (ret) {
-			res_counter_set_limit(&memcg->kmem, RESOURCE_MAX);
+			res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX);
 			goto out;
 		}
 		static_key_slow_inc(&memcg_kmem_enabled_key);
@@ -6089,8 +5911,6 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 		mz = &pn->zoneinfo[zone];
 		lruvec_init(&mz->lruvec);
-		mz->usage_in_excess = 0;
-		mz->on_tree = false;
 		mz->memcg = memcg;
 	}
 	memcg->nodeinfo[node] = pn;
@@ -6146,7 +5966,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 	int node;
 	size_t size = memcg_size();

-	mem_cgroup_remove_from_trees(memcg);
 	free_css_id(&mem_cgroup_subsys, &memcg->css);

 	for_each_node(node)
@@ -6183,29 +6002,6 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
 }
 EXPORT_SYMBOL(parent_mem_cgroup);

-static void __init mem_cgroup_soft_limit_tree_init(void)
-{
-	struct mem_cgroup_tree_per_node *rtpn;
-	struct mem_cgroup_tree_per_zone *rtpz;
-	int tmp, node, zone;
-
-	for_each_node(node) {
-		tmp = node;
-		if (!node_state(node, N_NORMAL_MEMORY))
-			tmp = -1;
-		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
-		BUG_ON(!rtpn);
-
-		soft_limit_tree.rb_tree_per_node[node] = rtpn;
-
-		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
-			rtpz = &rtpn->rb_tree_per_zone[zone];
-			rtpz->rb_root = RB_ROOT;
-			spin_lock_init(&rtpz->lock);
-		}
-	}
-}
-
 static struct cgroup_subsys_state * __ref
 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 {
@@ -6235,6 +6031,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 	mutex_init(&memcg->thresholds_lock);
 	spin_lock_init(&memcg->move_lock);
 	vmpressure_init(&memcg->vmpressure);
+	spin_lock_init(&memcg->soft_lock);

 	return &memcg->css;

@@ -6312,6 +6109,13 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)

 	mem_cgroup_invalidate_reclaim_iterators(memcg);
 	mem_cgroup_reparent_charges(memcg);
+	if (memcg->soft_contributed) {
+		while ((memcg = parent_mem_cgroup(memcg)))
+			atomic_dec(&memcg->children_in_excess);
+
+		if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
+			atomic_dec(&root_mem_cgroup->children_in_excess);
+	}
 	mem_cgroup_destroy_all_caches(memcg);
 	vmpressure_cleanup(&memcg->vmpressure);
 }
@@ -6986,7 +6790,6 @@ static int __init mem_cgroup_init(void)
 {
 	hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
 	enable_swap_cgroup();
-	mem_cgroup_soft_limit_tree_init();
 	memcg_stock_init();
 	return 0;
 }

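The memcontrol.c hunks above open-code a wakeup-counter handshake so that an OOM waiter never misses the wakeup sent by the lock holder: the counter is snapshotted before the trylock, and the task sleeps only if it has not advanced since. Purely as an illustration of that ordering, here is a minimal userspace pthread sketch; the names (oom_in_progress, wakeups) are invented for the example, and this is an analogy, not the kernel implementation.

/* Build with: cc -pthread sketch.c */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t waitq = PTHREAD_COND_INITIALIZER;
static int oom_in_progress;		/* analogue of the hierarchy oom_lock */
static unsigned long wakeups;		/* analogue of the oom_wakeups counter */

static void oom_handle_or_wait(void)
{
	unsigned long seen;

	pthread_mutex_lock(&lock);
	seen = wakeups;			/* start "listening" before the trylock */
	if (!oom_in_progress) {
		oom_in_progress = 1;	/* we won the lock: handle the OOM */
		pthread_mutex_unlock(&lock);
		/* ... kill something, release memory ... */
		pthread_mutex_lock(&lock);
		oom_in_progress = 0;
		wakeups++;		/* publish progress ... */
		pthread_cond_broadcast(&waitq);	/* ... and wake sleepers */
		pthread_mutex_unlock(&lock);
		return;
	}
	/* Somebody else handles it: sleep only if no wakeup was missed. */
	while (wakeups == seen)
		pthread_cond_wait(&waitq, &lock);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	oom_handle_or_wait();
	puts("done");
	return 0;
}

The kernel cannot simply sleep like this in the charge context, which is why the same idea is split between mem_cgroup_oom() and mem_cgroup_oom_synchronize() above.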
+ 39 - 13
mm/memory.c

@@ -3695,7 +3695,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
  * but allow concurrent faults), and pte mapped but not yet locked.
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
-int handle_pte_fault(struct mm_struct *mm,
+static int handle_pte_fault(struct mm_struct *mm,
 		     struct vm_area_struct *vma, unsigned long address,
 		     pte_t *pte, pmd_t *pmd, unsigned int flags)
 {
@@ -3754,22 +3754,14 @@ unlock:
 /*
  * By the time we get here, we already hold the mm semaphore
  */
-int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, unsigned int flags)
+static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+			     unsigned long address, unsigned int flags)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;

-	__set_current_state(TASK_RUNNING);
-
-	count_vm_event(PGFAULT);
-	mem_cgroup_count_vm_event(mm, PGFAULT);
-
-	/* do counter updates before entering really critical section. */
-	check_sync_rss_stat(current);
-
 	if (unlikely(is_vm_hugetlb_page(vma)))
 		return hugetlb_fault(mm, vma, address, flags);

@@ -3782,9 +3774,12 @@ retry:
 	if (!pmd)
 		return VM_FAULT_OOM;
 	if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
+		int ret = VM_FAULT_FALLBACK;
 		if (!vma->vm_ops)
-			return do_huge_pmd_anonymous_page(mm, vma, address,
-							  pmd, flags);
+			ret = do_huge_pmd_anonymous_page(mm, vma, address,
+					pmd, flags);
+		if (!(ret & VM_FAULT_FALLBACK))
+			return ret;
 	} else {
 		pmd_t orig_pmd = *pmd;
 		int ret;
@@ -3850,6 +3845,37 @@ retry:
 	return handle_pte_fault(mm, vma, address, pte, pmd, flags);
 }

+int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+		    unsigned long address, unsigned int flags)
+{
+	int ret;
+
+	__set_current_state(TASK_RUNNING);
+
+	count_vm_event(PGFAULT);
+	mem_cgroup_count_vm_event(mm, PGFAULT);
+
+	/* do counter updates before entering really critical section. */
+	check_sync_rss_stat(current);
+
+	/*
+	 * Enable the memcg OOM handling for faults triggered in user
+	 * space.  Kernel faults are handled more gracefully.
+	 */
+	if (flags & FAULT_FLAG_USER)
+		mem_cgroup_enable_oom();
+
+	ret = __handle_mm_fault(mm, vma, address, flags);
+
+	if (flags & FAULT_FLAG_USER)
+		mem_cgroup_disable_oom();
+
+	if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)))
+		mem_cgroup_oom_synchronize();
+
+	return ret;
+}
+
 #ifndef __PAGETABLE_PUD_FOLDED
 /*
  * Allocate page upper directory.

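The __handle_mm_fault() change above falls back to regular pages whenever do_huge_pmd_anonymous_page() returns VM_FAULT_FALLBACK. A hedged way to observe that behaviour from userspace is to fault in a MADV_HUGEPAGE-hinted mapping and compare the thp_fault_alloc/thp_fault_fallback counters in /proc/vmstat; the counter names assume a THP-enabled kernel.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define SZ (4UL << 20)	/* 4 MB: room for at least one 2 MB huge page */

int main(void)
{
	char *p = mmap(NULL, SZ, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char line[256];
	FILE *f;

	if (p == MAP_FAILED)
		return 1;
	madvise(p, SZ, MADV_HUGEPAGE);	/* hint only; falling back is fine */
	memset(p, 1, SZ);		/* fault the pages in */

	f = fopen("/proc/vmstat", "r");
	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "thp_fault_alloc", 15) ||
		    !strncmp(line, "thp_fault_fallback", 18))
			fputs(line, stdout);	/* print both THP fault counters */
	fclose(f);
	munmap(p, SZ);
	return 0;
}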
+ 5 - 2
mm/oom_kill.c

@@ -678,9 +678,12 @@ out:
  */
 void pagefault_out_of_memory(void)
 {
-	struct zonelist *zonelist = node_zonelist(first_online_node,
-						  GFP_KERNEL);
+	struct zonelist *zonelist;
+	if (mem_cgroup_oom_synchronize())
+		return;
+
+	zonelist = node_zonelist(first_online_node, GFP_KERNEL);
 	if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) {
 		out_of_memory(NULL, 0, 0, NULL, false);
 		clear_zonelist_oom(zonelist, GFP_KERNEL);

+ 15 - 0
mm/page-writeback.c

@@ -2143,11 +2143,17 @@ EXPORT_SYMBOL(account_page_dirtied);

 /*
  * Helper function for set_page_writeback family.
+ *
+ * The caller must hold mem_cgroup_begin/end_update_page_stat() lock
+ * while calling this function.
+ * See test_set_page_writeback for example.
+ *
  * NOTE: Unlike account_page_dirtied this does not rely on being atomic
  * wrt interrupts.
  */
 void account_page_writeback(struct page *page)
 {
+	mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
 	inc_zone_page_state(page, NR_WRITEBACK);
 }
 EXPORT_SYMBOL(account_page_writeback);
@@ -2364,7 +2370,10 @@ int test_clear_page_writeback(struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
 	int ret;
+	bool locked;
+	unsigned long memcg_flags;
+	mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags);
 	if (mapping) {
 		struct backing_dev_info *bdi = mapping->backing_dev_info;
 		unsigned long flags;
@@ -2385,9 +2394,11 @@ int test_clear_page_writeback(struct page *page)
 		ret = TestClearPageWriteback(page);
 	}
 	if (ret) {
+		mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
 		dec_zone_page_state(page, NR_WRITEBACK);
 		inc_zone_page_state(page, NR_WRITTEN);
 	}
+	mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags);
 	return ret;
 }

@@ -2395,7 +2406,10 @@ int test_set_page_writeback(struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
 	int ret;
+	bool locked;
+	unsigned long memcg_flags;
+	mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags);
 	if (mapping) {
 		struct backing_dev_info *bdi = mapping->backing_dev_info;
 		unsigned long flags;
@@ -2422,6 +2436,7 @@ int test_set_page_writeback(struct page *page)
 	}
 	if (!ret)
 		account_page_writeback(page);
+	mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags);
 	return ret;

 }

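With the accounting added above, a cgroup's pages under writeback show up in its memory.stat file. A small reader, assuming the v1 memory controller is mounted at /sys/fs/cgroup/memory (adjust the path for other setups), might look like this:

#include <stdio.h>
#include <string.h>

int main(int argc, char **argv)
{
	const char *grp = argc > 1 ? argv[1] : "";	/* cgroup path, e.g. "mygroup" */
	char path[512], line[256];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/fs/cgroup/memory/%s/memory.stat", grp);
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	/* print the per-group and hierarchical writeback counters */
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "writeback ", 10) ||
		    !strncmp(line, "total_writeback ", 16))
			fputs(line, stdout);
	fclose(f);
	return 0;
}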
+ 11 - 11
mm/rmap.c

@@ -1052,11 +1052,11 @@ void do_page_add_anon_rmap(struct page *page,
 {
 	int first = atomic_inc_and_test(&page->_mapcount);
 	if (first) {
-		if (!PageTransHuge(page))
-			__inc_zone_page_state(page, NR_ANON_PAGES);
-		else
+		if (PageTransHuge(page))
 			__inc_zone_page_state(page,
 					      NR_ANON_TRANSPARENT_HUGEPAGES);
+		__mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
+				hpage_nr_pages(page));
 	}
 	if (unlikely(PageKsm(page)))
 		return;
@@ -1085,10 +1085,10 @@ void page_add_new_anon_rmap(struct page *page,
 	VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
 	SetPageSwapBacked(page);
 	atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
-	if (!PageTransHuge(page))
-		__inc_zone_page_state(page, NR_ANON_PAGES);
-	else
+	if (PageTransHuge(page))
 		__inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+	__mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
+			hpage_nr_pages(page));
 	__page_set_anon_rmap(page, vma, address, 1);
 	if (!mlocked_vma_newpage(vma, page)) {
 		SetPageActive(page);
@@ -1111,7 +1111,7 @@ void page_add_file_rmap(struct page *page)
 	mem_cgroup_begin_update_page_stat(page, &locked, &flags);
 	if (atomic_inc_and_test(&page->_mapcount)) {
 		__inc_zone_page_state(page, NR_FILE_MAPPED);
-		mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
+		mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
 	}
 	mem_cgroup_end_update_page_stat(page, &locked, &flags);
 }
@@ -1148,14 +1148,14 @@ void page_remove_rmap(struct page *page)
 		goto out;
 	if (anon) {
 		mem_cgroup_uncharge_page(page);
-		if (!PageTransHuge(page))
-			__dec_zone_page_state(page, NR_ANON_PAGES);
-		else
+		if (PageTransHuge(page))
 			__dec_zone_page_state(page,
 					      NR_ANON_TRANSPARENT_HUGEPAGES);
+		__mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
+				-hpage_nr_pages(page));
 	} else {
 		__dec_zone_page_state(page, NR_FILE_MAPPED);
-		mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED);
+		mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
 		mem_cgroup_end_update_page_stat(page, &locked, &flags);
 	}
 	if (unlikely(PageMlocked(page)))

+ 39 - 5
mm/swap.c

@@ -432,6 +432,11 @@ static void activate_page_drain(int cpu)
 		pagevec_lru_move_fn(pvec, __activate_page, NULL);
 }

+static bool need_activate_page_drain(int cpu)
+{
+	return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
+}
+
 void activate_page(struct page *page)
 {
 	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
@@ -449,6 +454,11 @@ static inline void activate_page_drain(int cpu)
 {
 }

+static bool need_activate_page_drain(int cpu)
+{
+	return false;
+}
+
 void activate_page(struct page *page)
 {
 	struct zone *zone = page_zone(page);
@@ -701,12 +711,36 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
 	lru_add_drain();
 }

-/*
- * Returns 0 for success
- */
-int lru_add_drain_all(void)
+static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
+
+void lru_add_drain_all(void)
 {
-	return schedule_on_each_cpu(lru_add_drain_per_cpu);
+	static DEFINE_MUTEX(lock);
+	static struct cpumask has_work;
+	int cpu;
+
+	mutex_lock(&lock);
+	get_online_cpus();
+	cpumask_clear(&has_work);
+
+	for_each_online_cpu(cpu) {
+		struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
+
+		if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
+		    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
+		    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
+		    need_activate_page_drain(cpu)) {
+			INIT_WORK(work, lru_add_drain_per_cpu);
+			schedule_work_on(cpu, work);
+			cpumask_set_cpu(cpu, &has_work);
+		}
+	}
+
+	for_each_cpu(cpu, &has_work)
+		flush_work(&per_cpu(lru_add_drain_work, cpu));
+
+	put_online_cpus();
+	mutex_unlock(&lock);
 }

 /*

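lru_add_drain_all() above now queues and flushes work only on CPUs whose pagevecs are non-empty. As a rough userspace analogue of that "remember who has work, wait only for them" pattern (worker and pending names are invented for the example, this is not kernel code):

/* Build with: cc -pthread drain.c */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NWORKERS 4

static int pending[NWORKERS] = { 3, 0, 1, 0 };	/* pretend per-CPU pagevec counts */
static pthread_t tids[NWORKERS];

static void *drain(void *arg)
{
	int id = *(int *)arg;

	printf("draining worker %d (%d pages)\n", id, pending[id]);
	pending[id] = 0;
	return NULL;
}

int main(void)
{
	static int ids[NWORKERS];
	bool has_work[NWORKERS] = { false };
	int i;

	for (i = 0; i < NWORKERS; i++) {
		if (!pending[i])
			continue;		/* nothing queued, skip it entirely */
		ids[i] = i;
		pthread_create(&tids[i], NULL, drain, &ids[i]);
		has_work[i] = true;		/* analogue of the has_work cpumask */
	}
	for (i = 0; i < NWORKERS; i++)
		if (has_work[i])
			pthread_join(tids[i], NULL);	/* flush only what was queued */
	return 0;
}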
+ 2 - 7
mm/truncate.c

@@ -567,7 +567,6 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
 /**
  * truncate_pagecache - unmap and remove pagecache that has been truncated
  * @inode: inode
- * @oldsize: old file size
  * @newsize: new file size
  *
  * inode's new i_size must already be written before truncate_pagecache
@@ -580,7 +579,7 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
  * situations such as writepage being called for a page that has already
  * had its underlying blocks deallocated.
  */
-void truncate_pagecache(struct inode *inode, loff_t oldsize, loff_t newsize)
+void truncate_pagecache(struct inode *inode, loff_t newsize)
 {
 	struct address_space *mapping = inode->i_mapping;
 	loff_t holebegin = round_up(newsize, PAGE_SIZE);
@@ -614,12 +613,8 @@ EXPORT_SYMBOL(truncate_pagecache);
  */
 void truncate_setsize(struct inode *inode, loff_t newsize)
 {
-	loff_t oldsize;
-
-	oldsize = inode->i_size;
 	i_size_write(inode, newsize);
-
-	truncate_pagecache(inode, oldsize, newsize);
+	truncate_pagecache(inode, newsize);
 }
 EXPORT_SYMBOL(truncate_setsize);


+ 52 - 31
mm/vmscan.c

@@ -139,11 +139,23 @@ static bool global_reclaim(struct scan_control *sc)
 {
 	return !sc->target_mem_cgroup;
 }
+
+static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
+{
+	struct mem_cgroup *root = sc->target_mem_cgroup;
+	return !mem_cgroup_disabled() &&
+		mem_cgroup_soft_reclaim_eligible(root, root) != SKIP_TREE;
+}
 #else
 static bool global_reclaim(struct scan_control *sc)
 {
 	return true;
 }
+
+static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
+{
+	return false;
+}
 #endif

 unsigned long zone_reclaimable_pages(struct zone *zone)
@@ -2164,9 +2176,11 @@ static inline bool should_continue_reclaim(struct zone *zone,
 	}
 }

-static void shrink_zone(struct zone *zone, struct scan_control *sc)
+static int
+__shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 {
 	unsigned long nr_reclaimed, nr_scanned;
+	int groups_scanned = 0;

 	do {
 		struct mem_cgroup *root = sc->target_mem_cgroup;
@@ -2174,15 +2188,17 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
 			.zone = zone,
 			.priority = sc->priority,
 		};
-		struct mem_cgroup *memcg;
+		struct mem_cgroup *memcg = NULL;
+		mem_cgroup_iter_filter filter = (soft_reclaim) ?
+			mem_cgroup_soft_reclaim_eligible : NULL;
 

 		nr_reclaimed = sc->nr_reclaimed;
 		nr_scanned = sc->nr_scanned;

-		do {
+		while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) {
 			struct lruvec *lruvec;

+			groups_scanned++;
 			lruvec = mem_cgroup_zone_lruvec(zone, memcg);

 			shrink_lruvec(lruvec, sc);
@@ -2202,8 +2218,7 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
 				mem_cgroup_iter_break(root, memcg);
 				break;
 			}
-			memcg = mem_cgroup_iter(root, memcg, &reclaim);
-		} while (memcg);
+		}

 		vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
 			   sc->nr_scanned - nr_scanned,
@@ -2211,6 +2226,37 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)

 	} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
 					 sc->nr_scanned - nr_scanned, sc));
+
+	return groups_scanned;
+}
+
+
+static void shrink_zone(struct zone *zone, struct scan_control *sc)
+{
+	bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc);
+	unsigned long nr_scanned = sc->nr_scanned;
+	int scanned_groups;
+
+	scanned_groups = __shrink_zone(zone, sc, do_soft_reclaim);
+	/*
+	 * memcg iterator might race with other reclaimer or start from
+	 * an incomplete tree walk so the tree walk in __shrink_zone
+	 * might have missed groups that are above the soft limit. Try
+	 * another loop to catch up with others. Do it just once to
+	 * prevent from reclaim latencies when other reclaimers always
+	 * preempt this one.
+	 */
+	if (do_soft_reclaim && !scanned_groups)
+		__shrink_zone(zone, sc, do_soft_reclaim);
+
+	/*
+	 * No group is over the soft limit or those that are do not have
+	 * pages in the zone we are reclaiming so we have to reclaim everybody
+	 */
+	if (do_soft_reclaim && (sc->nr_scanned == nr_scanned)) {
+		__shrink_zone(zone, sc, false);
+		return;
+	}
 }
 }

 /* Returns true if compaction should go ahead for a high-order request */
 {
 {
 	struct zoneref *z;
 	struct zone *zone;
-	unsigned long nr_soft_scanned;
 	bool aborted_reclaim = false;
 	bool aborted_reclaim = false;

 	/*
 					continue;
 					continue;
 				}
 			}
-			 * This steals pages from memory cgroups over softlimit
-			 * and returns the number of reclaimed pages and
-			 * scanned pages. This works for global memory pressure
-			 * and balancing, not for a memcg's limit.
-			 */
-			nr_soft_scanned = 0;
-			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
-						sc->order, sc->gfp_mask,
-						&nr_soft_scanned);
-			sc->nr_reclaimed += nr_soft_reclaimed;
-			sc->nr_scanned += nr_soft_scanned;
 			/* need some check for avoid more shrink_zone() */
 		}

@@ -2920,8 +2952,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 {
 	int i;
 	int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
-	unsigned long nr_soft_reclaimed;
-	unsigned long nr_soft_scanned;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
 		.priority = DEF_PRIORITY,
@@ -3036,15 +3066,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,

 			sc.nr_scanned = 0;

-			nr_soft_scanned = 0;
-			/*
-			 * Call soft limit reclaim before calling shrink_zone.
-			 */
-			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
-							order, sc.gfp_mask,
-							&nr_soft_scanned);
-			sc.nr_reclaimed += nr_soft_reclaimed;
-
 			/*
 			 * There should be no need to raise the scanning
 			 * priority if enough pages are already being scanned

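The vmscan.c changes drive soft limit reclaim from the regular shrink_zone() path, keyed off each group's memory.soft_limit_in_bytes. For reference, a hedged example of setting that knob from userspace; the path assumes a v1 memory controller mounted at /sys/fs/cgroup/memory and a group name passed on the command line.

#include <stdio.h>

int main(int argc, char **argv)
{
	const char *grp = argc > 1 ? argv[1] : "mygroup";	/* hypothetical group */
	char path[512];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/fs/cgroup/memory/%s/memory.soft_limit_in_bytes", grp);
	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return 1;
	}
	fprintf(f, "%llu\n", 256ULL << 20);	/* 256 MB soft limit */
	fclose(f);
	return 0;
}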
+ 5 - 5
net/ipv4/tcp_memcontrol.c

@@ -87,8 +87,8 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
 	if (!cg_proto)
 		return -EINVAL;

-	if (val > RESOURCE_MAX)
-		val = RESOURCE_MAX;
+	if (val > RES_COUNTER_MAX)
+		val = RES_COUNTER_MAX;

 	tcp = tcp_from_cgproto(cg_proto);

@@ -101,9 +101,9 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
 		tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT,
 					     net->ipv4.sysctl_tcp_mem[i]);

-	if (val == RESOURCE_MAX)
+	if (val == RES_COUNTER_MAX)
 		clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
-	else if (val != RESOURCE_MAX) {
+	else if (val != RES_COUNTER_MAX) {
 		/*
 		 * The active bit needs to be written after the static_key
 		 * update. This is what guarantees that the socket activation
@@ -187,7 +187,7 @@ static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft)

 	switch (cft->private) {
 	case RES_LIMIT:
-		val = tcp_read_stat(memcg, RES_LIMIT, RESOURCE_MAX);
+		val = tcp_read_stat(memcg, RES_LIMIT, RES_COUNTER_MAX);
 		break;
 	case RES_USAGE:
 		val = tcp_read_usage(memcg);
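RES_COUNTER_MAX is the "unlimited" value the code above compares against; from userspace it is normally reached by writing -1 to the limit file. A hedged example, again assuming a v1 memory controller mount at /sys/fs/cgroup/memory and a group name given as the first argument:

#include <stdio.h>

int main(int argc, char **argv)
{
	const char *grp = argc > 1 ? argv[1] : "mygroup";	/* hypothetical group */
	char path[512];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/fs/cgroup/memory/%s/memory.kmem.tcp.limit_in_bytes", grp);
	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return 1;
	}
	fputs("-1\n", f);	/* -1 resets the limit to RES_COUNTER_MAX (unlimited) */
	fclose(f);
	return 0;
}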