
Merge branch 'akpm' (patches from Andrew Morton)

Merge more patches from Andrew Morton:
 "The rest of MM.  Plus one misc cleanup"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (35 commits)
  mm/Kconfig: add MMU dependency for MIGRATION.
  kernel: replace strict_strto*() with kstrto*()
  mm, thp: count thp_fault_fallback anytime thp fault fails
  thp: consolidate code between handle_mm_fault() and do_huge_pmd_anonymous_page()
  thp: do_huge_pmd_anonymous_page() cleanup
  thp: move maybe_pmd_mkwrite() out of mk_huge_pmd()
  mm: cleanup add_to_page_cache_locked()
  thp: account anon transparent huge pages into NR_ANON_PAGES
  truncate: drop 'oldsize' truncate_pagecache() parameter
  mm: make lru_add_drain_all() selective
  memcg: document cgroup dirty/writeback memory statistics
  memcg: add per cgroup writeback pages accounting
  memcg: check for proper lock held in mem_cgroup_update_page_stat
  memcg: remove MEMCG_NR_FILE_MAPPED
  memcg: reduce function dereference
  memcg: avoid overflow caused by PAGE_ALIGN
  memcg: rename RESOURCE_MAX to RES_COUNTER_MAX
  memcg: correct RESOURCE_MAX to ULLONG_MAX
  mm: memcg: do not trap chargers with full callstack on OOM
  mm: memcg: rework and document OOM waiting and wakeup
  ...
Linus Torvalds, 12 years ago
Parent
Commit ac4de9543a
79 files changed, 973 insertions(+), 919 deletions(-)
  1. Documentation/cgroups/memory.txt (+2, -0)
  2. arch/alpha/mm/fault.c (+4, -3)
  3. arch/arc/mm/fault.c (+4, -7)
  4. arch/arm/mm/fault.c (+13, -10)
  5. arch/arm64/mm/fault.c (+17, -14)
  6. arch/avr32/mm/fault.c (+3, -1)
  7. arch/cris/mm/fault.c (+4, -2)
  8. arch/frv/mm/fault.c (+6, -4)
  9. arch/hexagon/mm/vm_fault.c (+4, -2)
  10. arch/ia64/mm/fault.c (+4, -2)
  11. arch/m32r/mm/fault.c (+6, -4)
  12. arch/m68k/mm/fault.c (+2, -0)
  13. arch/metag/mm/fault.c (+4, -2)
  14. arch/microblaze/mm/fault.c (+5, -2)
  15. arch/mips/mm/fault.c (+6, -2)
  16. arch/mn10300/mm/fault.c (+2, -0)
  17. arch/openrisc/mm/fault.c (+1, -0)
  18. arch/parisc/mm/fault.c (+5, -2)
  19. arch/powerpc/mm/fault.c (+4, -3)
  20. arch/s390/mm/fault.c (+2, -0)
  21. arch/score/mm/fault.c (+6, -7)
  22. arch/sh/mm/fault.c (+6, -3)
  23. arch/sparc/mm/fault_32.c (+9, -3)
  24. arch/sparc/mm/fault_64.c (+4, -2)
  25. arch/tile/mm/fault.c (+5, -8)
  26. arch/um/kernel/trap.c (+14, -8)
  27. arch/unicore32/mm/fault.c (+13, -9)
  28. arch/x86/mm/fault.c (+22, -21)
  29. arch/xtensa/mm/fault.c (+2, -0)
  30. drivers/base/node.c (+0, -6)
  31. fs/adfs/inode.c (+1, -1)
  32. fs/affs/file.c (+1, -1)
  33. fs/bfs/file.c (+1, -1)
  34. fs/btrfs/free-space-cache.c (+1, -3)
  35. fs/btrfs/inode.c (+1, -1)
  36. fs/cifs/inode.c (+1, -4)
  37. fs/exofs/inode.c (+1, -1)
  38. fs/ext2/inode.c (+1, -1)
  39. fs/ext4/inode.c (+1, -2)
  40. fs/fat/inode.c (+1, -1)
  41. fs/fuse/dir.c (+1, -1)
  42. fs/fuse/inode.c (+1, -1)
  43. fs/gfs2/bmap.c (+2, -2)
  44. fs/hfs/inode.c (+1, -1)
  45. fs/hfsplus/inode.c (+1, -1)
  46. fs/hpfs/file.c (+1, -1)
  47. fs/jfs/inode.c (+1, -1)
  48. fs/minix/inode.c (+1, -1)
  49. fs/nfs/inode.c (+1, -3)
  50. fs/nilfs2/inode.c (+1, -1)
  51. fs/ntfs/file.c (+1, -1)
  52. fs/omfs/file.c (+1, -1)
  53. fs/proc/meminfo.c (+0, -6)
  54. fs/sysv/itree.c (+1, -1)
  55. fs/udf/inode.c (+1, -1)
  56. fs/ufs/inode.c (+1, -1)
  57. fs/xfs/xfs_aops.c (+2, -2)
  58. include/linux/huge_mm.h (+0, -3)
  59. include/linux/memcontrol.h (+130, -18)
  60. include/linux/mm.h (+4, -2)
  61. include/linux/res_counter.h (+1, -1)
  62. include/linux/sched.h (+7, -0)
  63. include/linux/swap.h (+1, -1)
  64. kernel/gcov/fs.c (+1, -1)
  65. kernel/ksysfs.c (+1, -1)
  66. kernel/params.c (+7, -7)
  67. kernel/res_counter.c (+16, -9)
  68. mm/Kconfig (+2, -2)
  69. mm/filemap.c (+35, -24)
  70. mm/huge_memory.c (+56, -73)
  71. mm/memcontrol.c (+337, -534)
  72. mm/memory.c (+39, -13)
  73. mm/oom_kill.c (+5, -2)
  74. mm/page-writeback.c (+15, -0)
  75. mm/rmap.c (+11, -11)
  76. mm/swap.c (+39, -5)
  77. mm/truncate.c (+2, -7)
  78. mm/vmscan.c (+52, -31)
  79. net/ipv4/tcp_memcontrol.c (+5, -5)

+ 2 - 0
Documentation/cgroups/memory.txt

@@ -490,6 +490,8 @@ pgpgin		- # of charging events to the memory cgroup. The charging
 pgpgout		- # of uncharging events to the memory cgroup. The uncharging
 		event happens each time a page is unaccounted from the cgroup.
 swap		- # of bytes of swap usage
+writeback	- # of bytes of file/anon cache that are queued for syncing to
+		disk.
 inactive_anon	- # of bytes of anonymous and swap cache memory on inactive
 		LRU list.
 active_anon	- # of bytes of anonymous and swap cache memory on active
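
For reference, the new counter appears as just another line in memory.stat. A minimal userspace sketch of reading it follows; the cgroup-v1 mount point and group name are assumptions for illustration, not part of this patch.

/* Minimal sketch: print the "writeback" line from a memcg's memory.stat.
 * The mount point and group path below are assumed for illustration.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/memory/mygroup/memory.stat";
	char line[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, "writeback ", 10) == 0)
			fputs(line, stdout);	/* e.g. "writeback 1234567" */
	}
	fclose(f);
	return 0;
}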

+ 4 - 3
arch/alpha/mm/fault.c

@@ -89,8 +89,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
 	const struct exception_table_entry *fixup;
 	int fault, si_code = SEGV_MAPERR;
 	siginfo_t info;
-	unsigned int flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-			      (cause > 0 ? FAULT_FLAG_WRITE : 0));
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 	/* As of EV6, a load into $31/$f31 is a prefetch, and never faults
 	   (or is suppressed by the PALcode).  Support that for older CPUs
@@ -115,7 +114,8 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
 	if (address >= TASK_SIZE)
 		goto vmalloc_fault;
 #endif
-
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
 retry:
 	down_read(&mm->mmap_sem);
 	vma = find_vma(mm, address);
@@ -142,6 +142,7 @@ retry:
 	} else {
 		if (!(vma->vm_flags & VM_WRITE))
 			goto bad_area;
+		flags |= FAULT_FLAG_WRITE;
 	}
 
 	/* If for any reason at all we couldn't handle the fault,

+ 4 - 7
arch/arc/mm/fault.c

@@ -60,8 +60,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address)
 	siginfo_t info;
 	int fault, ret;
 	int write = regs->ecr_cause & ECR_C_PROTV_STORE;  /* ST/EX */
-	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-				(write ? FAULT_FLAG_WRITE : 0);
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 	/*
 	 * We fault-in kernel-space virtual memory on-demand. The
@@ -89,6 +88,8 @@ void do_page_fault(struct pt_regs *regs, unsigned long address)
 	if (in_atomic() || !mm)
 		goto no_context;
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
 retry:
 	down_read(&mm->mmap_sem);
 	vma = find_vma(mm, address);
@@ -117,12 +118,12 @@ good_area:
 	if (write) {
 		if (!(vma->vm_flags & VM_WRITE))
 			goto bad_area;
+		flags |= FAULT_FLAG_WRITE;
 	} else {
 		if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
 			goto bad_area;
 	}
 
-survive:
 	/*
 	 * If for any reason at all we couldn't handle the fault,
 	 * make sure we exit gracefully rather than endlessly redo
@@ -201,10 +202,6 @@ no_context:
 	die("Oops", regs, address);
 
 out_of_memory:
-	if (is_global_init(tsk)) {
-		yield();
-		goto survive;
-	}
 	up_read(&mm->mmap_sem);
 
 	if (user_mode(regs)) {

+ 13 - 10
arch/arm/mm/fault.c

@@ -261,9 +261,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 	struct task_struct *tsk;
 	struct mm_struct *mm;
 	int fault, sig, code;
-	int write = fsr & FSR_WRITE;
-	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-				(write ? FAULT_FLAG_WRITE : 0);
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 	if (notify_page_fault(regs, fsr))
 		return 0;
@@ -282,6 +280,11 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 	if (in_atomic() || !mm)
 		goto no_context;
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
+	if (fsr & FSR_WRITE)
+		flags |= FAULT_FLAG_WRITE;
+
 	/*
 	 * As per x86, we may deadlock here.  However, since the kernel only
 	 * validly references user space from well defined areas of the code,
@@ -349,6 +352,13 @@ retry:
 	if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS))))
 		return 0;
 
+	/*
+	 * If we are in kernel mode at this point, we
+	 * have no context to handle this fault with.
+	 */
+	if (!user_mode(regs))
+		goto no_context;
+
 	if (fault & VM_FAULT_OOM) {
 		/*
 		 * We ran out of memory, call the OOM killer, and return to
@@ -359,13 +369,6 @@ retry:
 		return 0;
 	}
 
-	/*
-	 * If we are in kernel mode at this point, we
-	 * have no context to handle this fault with.
-	 */
-	if (!user_mode(regs))
-		goto no_context;
-
 	if (fault & VM_FAULT_SIGBUS) {
 		/*
 		 * We had some memory, but were unable to

+ 17 - 14
arch/arm64/mm/fault.c

@@ -199,13 +199,6 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	unsigned long vm_flags = VM_READ | VM_WRITE | VM_EXEC;
 	unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
-	if (esr & ESR_LNX_EXEC) {
-		vm_flags = VM_EXEC;
-	} else if ((esr & ESR_WRITE) && !(esr & ESR_CM)) {
-		vm_flags = VM_WRITE;
-		mm_flags |= FAULT_FLAG_WRITE;
-	}
-
 	tsk = current;
 	mm  = tsk->mm;
 
@@ -220,6 +213,16 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	if (in_atomic() || !mm)
 		goto no_context;
 
+	if (user_mode(regs))
+		mm_flags |= FAULT_FLAG_USER;
+
+	if (esr & ESR_LNX_EXEC) {
+		vm_flags = VM_EXEC;
+	} else if ((esr & ESR_WRITE) && !(esr & ESR_CM)) {
+		vm_flags = VM_WRITE;
+		mm_flags |= FAULT_FLAG_WRITE;
+	}
+
 	/*
 	 * As per x86, we may deadlock here. However, since the kernel only
 	 * validly references user space from well defined areas of the code,
@@ -288,6 +291,13 @@ retry:
 			      VM_FAULT_BADACCESS))))
 		return 0;
 
+	/*
+	 * If we are in kernel mode at this point, we have no context to
+	 * handle this fault with.
+	 */
+	if (!user_mode(regs))
+		goto no_context;
+
 	if (fault & VM_FAULT_OOM) {
 		/*
 		 * We ran out of memory, call the OOM killer, and return to
@@ -298,13 +308,6 @@ retry:
 		return 0;
 	}
 
-	/*
-	 * If we are in kernel mode at this point, we have no context to
-	 * handle this fault with.
-	 */
-	if (!user_mode(regs))
-		goto no_context;
-
 	if (fault & VM_FAULT_SIGBUS) {
 		/*
 		 * We had some memory, but were unable to successfully fix up

+ 3 - 1
arch/avr32/mm/fault.c

@@ -86,6 +86,8 @@ asmlinkage void do_page_fault(unsigned long ecr, struct pt_regs *regs)
 
 	local_irq_enable();
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
 retry:
 	down_read(&mm->mmap_sem);
 
@@ -228,9 +230,9 @@ no_context:
 	 */
 out_of_memory:
 	up_read(&mm->mmap_sem);
-	pagefault_out_of_memory();
 	if (!user_mode(regs))
 		goto no_context;
+	pagefault_out_of_memory();
 	return;
 
 do_sigbus:

+ 4 - 2
arch/cris/mm/fault.c

@@ -58,8 +58,7 @@ do_page_fault(unsigned long address, struct pt_regs *regs,
 	struct vm_area_struct * vma;
 	siginfo_t info;
 	int fault;
-	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-				((writeaccess & 1) ? FAULT_FLAG_WRITE : 0);
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 	D(printk(KERN_DEBUG
 		 "Page fault for %lX on %X at %lX, prot %d write %d\n",
@@ -117,6 +116,8 @@ do_page_fault(unsigned long address, struct pt_regs *regs,
 	if (in_atomic() || !mm)
 		goto no_context;
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
 retry:
 	down_read(&mm->mmap_sem);
 	vma = find_vma(mm, address);
@@ -155,6 +156,7 @@ retry:
 	} else if (writeaccess == 1) {
 		if (!(vma->vm_flags & VM_WRITE))
 			goto bad_area;
+		flags |= FAULT_FLAG_WRITE;
 	} else {
 		if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
 			goto bad_area;

+ 6 - 4
arch/frv/mm/fault.c

@@ -34,11 +34,11 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
 	struct vm_area_struct *vma;
 	struct mm_struct *mm;
 	unsigned long _pme, lrai, lrad, fixup;
+	unsigned long flags = 0;
 	siginfo_t info;
 	pgd_t *pge;
 	pud_t *pue;
 	pte_t *pte;
-	int write;
 	int fault;
 
 #if 0
@@ -81,6 +81,9 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
 	if (in_atomic() || !mm)
 		goto no_context;
 
+	if (user_mode(__frame))
+		flags |= FAULT_FLAG_USER;
+
 	down_read(&mm->mmap_sem);
 
 	vma = find_vma(mm, ear0);
@@ -129,7 +132,6 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
  */
  good_area:
 	info.si_code = SEGV_ACCERR;
-	write = 0;
 	switch (esr0 & ESR0_ATXC) {
 	default:
 		/* handle write to write protected page */
@@ -140,7 +142,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
 #endif
 		if (!(vma->vm_flags & VM_WRITE))
 			goto bad_area;
-		write = 1;
+		flags |= FAULT_FLAG_WRITE;
 		break;
 
 		 /* handle read from protected page */
@@ -162,7 +164,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
 	 * make sure we exit gracefully rather than endlessly redo
 	 * the fault.
 	 */
-	fault = handle_mm_fault(mm, vma, ear0, write ? FAULT_FLAG_WRITE : 0);
+	fault = handle_mm_fault(mm, vma, ear0, flags);
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		if (fault & VM_FAULT_OOM)
 			goto out_of_memory;

+ 4 - 2
arch/hexagon/mm/vm_fault.c

@@ -53,8 +53,7 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs)
 	int si_code = SEGV_MAPERR;
 	int fault;
 	const struct exception_table_entry *fixup;
-	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-				 (cause > 0 ? FAULT_FLAG_WRITE : 0);
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 	/*
 	 * If we're in an interrupt or have no user context,
@@ -65,6 +64,8 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs)
 
 	local_irq_enable();
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
 retry:
 	down_read(&mm->mmap_sem);
 	vma = find_vma(mm, address);
@@ -96,6 +97,7 @@ good_area:
 	case FLT_STORE:
 		if (!(vma->vm_flags & VM_WRITE))
 			goto bad_area;
+		flags |= FAULT_FLAG_WRITE;
 		break;
 	}
 

+ 4 - 2
arch/ia64/mm/fault.c

@@ -90,8 +90,6 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
 	mask = ((((isr >> IA64_ISR_X_BIT) & 1UL) << VM_EXEC_BIT)
 		| (((isr >> IA64_ISR_W_BIT) & 1UL) << VM_WRITE_BIT));
 
-	flags |= ((mask & VM_WRITE) ? FAULT_FLAG_WRITE : 0);
-
 	/* mmap_sem is performance critical.... */
 	prefetchw(&mm->mmap_sem);
 
@@ -119,6 +117,10 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
 	if (notify_page_fault(regs, TRAP_BRKPT))
 		return;
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
+	if (mask & VM_WRITE)
+		flags |= FAULT_FLAG_WRITE;
 retry:
 	down_read(&mm->mmap_sem);
 

+ 6 - 4
arch/m32r/mm/fault.c

@@ -78,7 +78,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	struct mm_struct *mm;
 	struct vm_area_struct * vma;
 	unsigned long page, addr;
-	int write;
+	unsigned long flags = 0;
 	int fault;
 	siginfo_t info;
 
@@ -117,6 +117,9 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	if (in_atomic() || !mm)
 		goto bad_area_nosemaphore;
 
+	if (error_code & ACE_USERMODE)
+		flags |= FAULT_FLAG_USER;
+
 	/* When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in the
 	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
@@ -166,14 +169,13 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code,
  */
 good_area:
 	info.si_code = SEGV_ACCERR;
-	write = 0;
 	switch (error_code & (ACE_WRITE|ACE_PROTECTION)) {
 		default:	/* 3: write, present */
 			/* fall through */
 		case ACE_WRITE:	/* write, not present */
 			if (!(vma->vm_flags & VM_WRITE))
 				goto bad_area;
-			write++;
+			flags |= FAULT_FLAG_WRITE;
 			break;
 		case ACE_PROTECTION:	/* read, present */
 		case 0:		/* read, not present */
@@ -194,7 +196,7 @@ good_area:
 	 */
 	addr = (address & PAGE_MASK);
 	set_thread_fault_code(error_code);
-	fault = handle_mm_fault(mm, vma, addr, write ? FAULT_FLAG_WRITE : 0);
+	fault = handle_mm_fault(mm, vma, addr, flags);
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		if (fault & VM_FAULT_OOM)
 			goto out_of_memory;

+ 2 - 0
arch/m68k/mm/fault.c

@@ -88,6 +88,8 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
 	if (in_atomic() || !mm)
 		goto no_context;
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
 retry:
 	down_read(&mm->mmap_sem);
 

+ 4 - 2
arch/metag/mm/fault.c

@@ -53,8 +53,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
 	struct vm_area_struct *vma, *prev_vma;
 	siginfo_t info;
 	int fault;
-	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-				(write_access ? FAULT_FLAG_WRITE : 0);
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 	tsk = current;
 
@@ -109,6 +108,8 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
 	if (in_atomic() || !mm)
 		goto no_context;
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
 retry:
 	down_read(&mm->mmap_sem);
 
@@ -121,6 +122,7 @@ good_area:
 	if (write_access) {
 		if (!(vma->vm_flags & VM_WRITE))
 			goto bad_area;
+		flags |= FAULT_FLAG_WRITE;
 	} else {
 		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
 			goto bad_area;

+ 5 - 2
arch/microblaze/mm/fault.c

@@ -92,8 +92,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address,
 	int code = SEGV_MAPERR;
 	int is_write = error_code & ESR_S;
 	int fault;
-	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-					 (is_write ? FAULT_FLAG_WRITE : 0);
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 	regs->ear = address;
 	regs->esr = error_code;
@@ -121,6 +120,9 @@ void do_page_fault(struct pt_regs *regs, unsigned long address,
 		die("Weird page fault", regs, SIGSEGV);
 	}
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
+
 	/* When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in the
 	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
@@ -199,6 +201,7 @@ good_area:
 	if (unlikely(is_write)) {
 		if (unlikely(!(vma->vm_flags & VM_WRITE)))
 			goto bad_area;
+		flags |= FAULT_FLAG_WRITE;
 	/* a read */
 	} else {
 		/* protection fault */

+ 6 - 2
arch/mips/mm/fault.c

@@ -42,8 +42,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write,
 	const int field = sizeof(unsigned long) * 2;
 	siginfo_t info;
 	int fault;
-	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-						 (write ? FAULT_FLAG_WRITE : 0);
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 #if 0
 	printk("Cpu%d[%s:%d:%0*lx:%ld:%0*lx]\n", raw_smp_processor_id(),
@@ -93,6 +92,8 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write,
 	if (in_atomic() || !mm)
 		goto bad_area_nosemaphore;
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
 retry:
 	down_read(&mm->mmap_sem);
 	vma = find_vma(mm, address);
@@ -114,6 +115,7 @@ good_area:
 	if (write) {
 		if (!(vma->vm_flags & VM_WRITE))
 			goto bad_area;
+		flags |= FAULT_FLAG_WRITE;
 	} else {
 		if (cpu_has_rixi) {
 			if (address == regs->cp0_epc && !(vma->vm_flags & VM_EXEC)) {
@@ -241,6 +243,8 @@ out_of_memory:
 	 * (which will retry the fault, or kill us if we got oom-killed).
 	 */
 	up_read(&mm->mmap_sem);
+	if (!user_mode(regs))
+		goto no_context;
 	pagefault_out_of_memory();
 	return;
 

+ 2 - 0
arch/mn10300/mm/fault.c

@@ -171,6 +171,8 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long fault_code,
 	if (in_atomic() || !mm)
 		goto no_context;
 
+	if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR)
+		flags |= FAULT_FLAG_USER;
 retry:
 	down_read(&mm->mmap_sem);
 

+ 1 - 0
arch/openrisc/mm/fault.c

@@ -86,6 +86,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long address,
 	if (user_mode(regs)) {
 		/* Exception was in userspace: reenable interrupts */
 		local_irq_enable();
+		flags |= FAULT_FLAG_USER;
 	} else {
 		/* If exception was in a syscall, then IRQ's may have
 		 * been enabled or disabled.  If they were enabled,

+ 5 - 2
arch/parisc/mm/fault.c

@@ -180,6 +180,10 @@ void do_page_fault(struct pt_regs *regs, unsigned long code,
 	if (in_atomic() || !mm)
 		goto no_context;
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
+	if (acc_type & VM_WRITE)
+		flags |= FAULT_FLAG_WRITE;
 retry:
 	down_read(&mm->mmap_sem);
 	vma = find_vma_prev(mm, address, &prev_vma);
@@ -203,8 +207,7 @@ good_area:
 	 * fault.
 	 */
 
-	fault = handle_mm_fault(mm, vma, address,
-			flags | ((acc_type & VM_WRITE) ? FAULT_FLAG_WRITE : 0));
+	fault = handle_mm_fault(mm, vma, address, flags);
 
 	if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
 		return;

+ 4 - 3
arch/powerpc/mm/fault.c

@@ -223,9 +223,6 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 	is_write = error_code & ESR_DST;
 #endif /* CONFIG_4xx || CONFIG_BOOKE */
 
-	if (is_write)
-		flags |= FAULT_FLAG_WRITE;
-
 #ifdef CONFIG_PPC_ICSWX
 	/*
 	 * we need to do this early because this "data storage
@@ -288,6 +285,9 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 	if (user_mode(regs))
 		store_update_sp = store_updates_sp(regs);
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
+
 	/* When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in the
 	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
@@ -415,6 +415,7 @@ good_area:
 	} else if (is_write) {
 		if (!(vma->vm_flags & VM_WRITE))
 			goto bad_area;
+		flags |= FAULT_FLAG_WRITE;
 	/* a read */
 	} else {
 		/* protection fault */

+ 2 - 0
arch/s390/mm/fault.c

@@ -302,6 +302,8 @@ static inline int do_exception(struct pt_regs *regs, int access)
 	address = trans_exc_code & __FAIL_ADDR_MASK;
 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 	flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
 	if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400)
 		flags |= FAULT_FLAG_WRITE;
 	down_read(&mm->mmap_sem);

+ 6 - 7
arch/score/mm/fault.c

@@ -47,6 +47,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write,
 	struct task_struct *tsk = current;
 	struct mm_struct *mm = tsk->mm;
 	const int field = sizeof(unsigned long) * 2;
+	unsigned long flags = 0;
 	siginfo_t info;
 	int fault;
 
@@ -75,6 +76,9 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write,
 	if (in_atomic() || !mm)
 		goto bad_area_nosemaphore;
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
+
 	down_read(&mm->mmap_sem);
 	vma = find_vma(mm, address);
 	if (!vma)
@@ -95,18 +99,18 @@ good_area:
 	if (write) {
 		if (!(vma->vm_flags & VM_WRITE))
 			goto bad_area;
+		flags |= FAULT_FLAG_WRITE;
 	} else {
 		if (!(vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)))
 			goto bad_area;
 	}
 
-survive:
 	/*
 	* If for any reason at all we couldn't handle the fault,
 	* make sure we exit gracefully rather than endlessly redo
 	* the fault.
 	*/
-	fault = handle_mm_fault(mm, vma, address, write);
+	fault = handle_mm_fault(mm, vma, address, flags);
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		if (fault & VM_FAULT_OOM)
 			goto out_of_memory;
@@ -167,11 +171,6 @@ no_context:
 	*/
 out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (is_global_init(tsk)) {
-		yield();
-		down_read(&mm->mmap_sem);
-		goto survive;
-	}
 	if (!user_mode(regs))
 		goto no_context;
 	pagefault_out_of_memory();

+ 6 - 3
arch/sh/mm/fault.c

@@ -400,9 +400,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 	struct mm_struct *mm;
 	struct vm_area_struct * vma;
 	int fault;
-	int write = error_code & FAULT_CODE_WRITE;
-	unsigned int flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-			      (write ? FAULT_FLAG_WRITE : 0));
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 	tsk = current;
 	mm = tsk->mm;
@@ -476,6 +474,11 @@ good_area:
 
 	set_thread_fault_code(error_code);
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
+	if (error_code & FAULT_CODE_WRITE)
+		flags |= FAULT_FLAG_WRITE;
+
 	/*
 	 * If for any reason at all we couldn't handle the fault,
 	 * make sure we exit gracefully rather than endlessly redo

+ 9 - 3
arch/sparc/mm/fault_32.c

@@ -177,8 +177,7 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write,
 	unsigned long g2;
 	int from_user = !(regs->psr & PSR_PS);
 	int fault, code;
-	unsigned int flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-			      (write ? FAULT_FLAG_WRITE : 0));
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 	if (text_fault)
 		address = regs->pc;
@@ -235,6 +234,11 @@ good_area:
 			goto bad_area;
 	}
 
+	if (from_user)
+		flags |= FAULT_FLAG_USER;
+	if (write)
+		flags |= FAULT_FLAG_WRITE;
+
 	/*
 	 * If for any reason at all we couldn't handle the fault,
 	 * make sure we exit gracefully rather than endlessly redo
@@ -383,6 +387,7 @@ static void force_user_fault(unsigned long address, int write)
 	struct vm_area_struct *vma;
 	struct task_struct *tsk = current;
 	struct mm_struct *mm = tsk->mm;
+	unsigned int flags = FAULT_FLAG_USER;
 	int code;
 
 	code = SEGV_MAPERR;
@@ -402,11 +407,12 @@ good_area:
 	if (write) {
 		if (!(vma->vm_flags & VM_WRITE))
 			goto bad_area;
+		flags |= FAULT_FLAG_WRITE;
 	} else {
 		if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
 			goto bad_area;
 	}
-	switch (handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0)) {
+	switch (handle_mm_fault(mm, vma, address, flags)) {
 	case VM_FAULT_SIGBUS:
 	case VM_FAULT_OOM:
 		goto do_sigbus;

+ 4 - 2
arch/sparc/mm/fault_64.c

@@ -315,7 +315,8 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
 			bad_kernel_pc(regs, address);
 			return;
 		}
-	}
+	} else
+		flags |= FAULT_FLAG_USER;
 
 	/*
 	 * If we're in an interrupt or have no user
@@ -418,13 +419,14 @@ good_area:
 		    vma->vm_file != NULL)
 			set_thread_fault_code(fault_code |
 					      FAULT_CODE_BLKCOMMIT);
+
+		flags |= FAULT_FLAG_WRITE;
 	} else {
 		/* Allow reads even for write-only mappings */
 		if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
 			goto bad_area;
 	}
 
-	flags |= ((fault_code & FAULT_CODE_WRITE) ? FAULT_FLAG_WRITE : 0);
 	fault = handle_mm_fault(mm, vma, address, flags);
 
 	if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))

+ 5 - 8
arch/tile/mm/fault.c

@@ -280,8 +280,7 @@ static int handle_page_fault(struct pt_regs *regs,
 	if (!is_page_fault)
 		write = 1;
 
-	flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-		 (write ? FAULT_FLAG_WRITE : 0));
+	flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 	is_kernel_mode = !user_mode(regs);
 
@@ -365,6 +364,9 @@ static int handle_page_fault(struct pt_regs *regs,
 		goto bad_area_nosemaphore;
 	}
 
+	if (!is_kernel_mode)
+		flags |= FAULT_FLAG_USER;
+
 	/*
 	 * When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in the
@@ -425,12 +427,12 @@ good_area:
 #endif
 		if (!(vma->vm_flags & VM_WRITE))
 			goto bad_area;
+		flags |= FAULT_FLAG_WRITE;
 	} else {
 		if (!is_page_fault || !(vma->vm_flags & VM_READ))
 			goto bad_area;
 	}
 
- survive:
 	/*
 	 * If for any reason at all we couldn't handle the fault,
 	 * make sure we exit gracefully rather than endlessly redo
@@ -555,11 +557,6 @@ no_context:
  */
 out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (is_global_init(tsk)) {
-		yield();
-		down_read(&mm->mmap_sem);
-		goto survive;
-	}
 	if (is_kernel_mode)
 		goto no_context;
 	pagefault_out_of_memory();

+ 14 - 8
arch/um/kernel/trap.c

@@ -30,8 +30,7 @@ int handle_page_fault(unsigned long address, unsigned long ip,
 	pmd_t *pmd;
 	pte_t *pte;
 	int err = -EFAULT;
-	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-				 (is_write ? FAULT_FLAG_WRITE : 0);
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 	*code_out = SEGV_MAPERR;
 
@@ -42,6 +41,8 @@ int handle_page_fault(unsigned long address, unsigned long ip,
 	if (in_atomic())
 		goto out_nosemaphore;
 
+	if (is_user)
+		flags |= FAULT_FLAG_USER;
 retry:
 	down_read(&mm->mmap_sem);
 	vma = find_vma(mm, address);
@@ -58,12 +59,15 @@ retry:
 
 good_area:
 	*code_out = SEGV_ACCERR;
-	if (is_write && !(vma->vm_flags & VM_WRITE))
-		goto out;
-
-	/* Don't require VM_READ|VM_EXEC for write faults! */
-	if (!is_write && !(vma->vm_flags & (VM_READ | VM_EXEC)))
-		goto out;
+	if (is_write) {
+		if (!(vma->vm_flags & VM_WRITE))
+			goto out;
+		flags |= FAULT_FLAG_WRITE;
+	} else {
+		/* Don't require VM_READ|VM_EXEC for write faults! */
+		if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
+			goto out;
+	}
 
 	do {
 		int fault;
@@ -124,6 +128,8 @@ out_of_memory:
 	 * (which will retry the fault, or kill us if we got oom-killed).
 	 */
 	up_read(&mm->mmap_sem);
+	if (!is_user)
+		goto out_nosemaphore;
 	pagefault_out_of_memory();
 	return 0;
 }

+ 13 - 9
arch/unicore32/mm/fault.c

@@ -209,8 +209,7 @@ static int do_pf(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 	struct task_struct *tsk;
 	struct mm_struct *mm;
 	int fault, sig, code;
-	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-				 ((!(fsr ^ 0x12)) ? FAULT_FLAG_WRITE : 0);
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 	tsk = current;
 	mm = tsk->mm;
@@ -222,6 +221,11 @@ static int do_pf(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 	if (in_atomic() || !mm)
 		goto no_context;
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
+	if (!(fsr ^ 0x12))
+		flags |= FAULT_FLAG_WRITE;
+
 	/*
 	 * As per x86, we may deadlock here.  However, since the kernel only
 	 * validly references user space from well defined areas of the code,
@@ -278,6 +282,13 @@ retry:
 	       (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS))))
 		return 0;
 
+	/*
+	 * If we are in kernel mode at this point, we
+	 * have no context to handle this fault with.
+	 */
+	if (!user_mode(regs))
+		goto no_context;
+
 	if (fault & VM_FAULT_OOM) {
 		/*
 		 * We ran out of memory, call the OOM killer, and return to
@@ -288,13 +299,6 @@ retry:
 		return 0;
 	}
 
-	/*
-	 * If we are in kernel mode at this point, we
-	 * have no context to handle this fault with.
-	 */
-	if (!user_mode(regs))
-		goto no_context;
-
 	if (fault & VM_FAULT_SIGBUS) {
 		/*
 		 * We had some memory, but were unable to

+ 22 - 21
arch/x86/mm/fault.c

@@ -842,23 +842,15 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 	force_sig_info_fault(SIGBUS, code, address, tsk, fault);
 }
 
-static noinline int
+static noinline void
 mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 	       unsigned long address, unsigned int fault)
 {
-	/*
-	 * Pagefault was interrupted by SIGKILL. We have no reason to
-	 * continue pagefault.
-	 */
-	if (fatal_signal_pending(current)) {
-		if (!(fault & VM_FAULT_RETRY))
-			up_read(&current->mm->mmap_sem);
-		if (!(error_code & PF_USER))
-			no_context(regs, error_code, address, 0, 0);
-		return 1;
+	if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
+		up_read(&current->mm->mmap_sem);
+		no_context(regs, error_code, address, 0, 0);
+		return;
 	}
-	if (!(fault & VM_FAULT_ERROR))
-		return 0;
 
 	if (fault & VM_FAULT_OOM) {
 		/* Kernel mode? Handle exceptions or die: */
@@ -866,7 +858,7 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 			up_read(&current->mm->mmap_sem);
 			no_context(regs, error_code, address,
 				   SIGSEGV, SEGV_MAPERR);
-			return 1;
+			return;
 		}
 
 		up_read(&current->mm->mmap_sem);
@@ -884,7 +876,6 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 		else
 			BUG();
 	}
-	return 1;
 }
 
 static int spurious_fault_check(unsigned long error_code, pte_t *pte)
@@ -1011,9 +1002,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	unsigned long address;
 	struct mm_struct *mm;
 	int fault;
-	int write = error_code & PF_WRITE;
-	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
-					(write ? FAULT_FLAG_WRITE : 0);
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 	tsk = current;
 	mm = tsk->mm;
@@ -1083,6 +1072,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	if (user_mode_vm(regs)) {
 		local_irq_enable();
 		error_code |= PF_USER;
+		flags |= FAULT_FLAG_USER;
 	} else {
 		if (regs->flags & X86_EFLAGS_IF)
 			local_irq_enable();
@@ -1109,6 +1099,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
 		return;
 	}
 
+	if (error_code & PF_WRITE)
+		flags |= FAULT_FLAG_WRITE;
+
 	/*
 	 * When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in
@@ -1187,9 +1180,17 @@ good_area:
 	 */
 	fault = handle_mm_fault(mm, vma, address, flags);
 
-	if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
-		if (mm_fault_error(regs, error_code, address, fault))
-			return;
+	/*
+	 * If we need to retry but a fatal signal is pending, handle the
+	 * signal first. We do not need to release the mmap_sem because it
+	 * would already be released in __lock_page_or_retry in mm/filemap.c.
+	 */
+	if (unlikely((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)))
+		return;
+
+	if (unlikely(fault & VM_FAULT_ERROR)) {
+		mm_fault_error(regs, error_code, address, fault);
+		return;
 	}
 
 	/*

+ 2 - 0
arch/xtensa/mm/fault.c

@@ -72,6 +72,8 @@ void do_page_fault(struct pt_regs *regs)
 	       address, exccause, regs->pc, is_write? "w":"", is_exec? "x":"");
 #endif
 
+	if (user_mode(regs))
+		flags |= FAULT_FLAG_USER;
 retry:
 	down_read(&mm->mmap_sem);
 	vma = find_vma(mm, address);

+ 0 - 6
drivers/base/node.c

@@ -125,13 +125,7 @@ static ssize_t node_read_meminfo(struct device *dev,
 		       nid, K(node_page_state(nid, NR_WRITEBACK)),
 		       nid, K(node_page_state(nid, NR_FILE_PAGES)),
 		       nid, K(node_page_state(nid, NR_FILE_MAPPED)),
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-		       nid, K(node_page_state(nid, NR_ANON_PAGES)
-			+ node_page_state(nid, NR_ANON_TRANSPARENT_HUGEPAGES) *
-			HPAGE_PMD_NR),
-#else
 		       nid, K(node_page_state(nid, NR_ANON_PAGES)),
-#endif
 		       nid, K(node_page_state(nid, NR_SHMEM)),
 		       nid, node_page_state(nid, NR_KERNEL_STACK) *
 				THREAD_SIZE / 1024,

+ 1 - 1
fs/adfs/inode.c

@@ -50,7 +50,7 @@ static void adfs_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size)
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 }
 
 static int adfs_write_begin(struct file *file, struct address_space *mapping,

+ 1 - 1
fs/affs/file.c

@@ -406,7 +406,7 @@ static void affs_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		affs_truncate(inode);
 	}
 }

+ 1 - 1
fs/bfs/file.c

@@ -166,7 +166,7 @@ static void bfs_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size)
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 }
 
 static int bfs_write_begin(struct file *file, struct address_space *mapping,

+ 1 - 3
fs/btrfs/free-space-cache.c

@@ -221,12 +221,10 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 				    struct btrfs_path *path,
 				    struct inode *inode)
 {
-	loff_t oldsize;
 	int ret = 0;
 
-	oldsize = i_size_read(inode);
 	btrfs_i_size_write(inode, 0);
-	truncate_pagecache(inode, oldsize, 0);
+	truncate_pagecache(inode, 0);
 
 	/*
 	 * We don't need an orphan item because truncating the free space cache

+ 1 - 1
fs/btrfs/inode.c

@@ -4349,7 +4349,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 		inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
 
 	if (newsize > oldsize) {
-		truncate_pagecache(inode, oldsize, newsize);
+		truncate_pagecache(inode, newsize);
 		ret = btrfs_cont_expand(inode, oldsize, newsize);
 		if (ret)
 			return ret;

+ 1 - 4
fs/cifs/inode.c

@@ -1856,14 +1856,11 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from)
 
 static void cifs_setsize(struct inode *inode, loff_t offset)
 {
-	loff_t oldsize;
-
 	spin_lock(&inode->i_lock);
-	oldsize = inode->i_size;
 	i_size_write(inode, offset);
 	spin_unlock(&inode->i_lock);
 
-	truncate_pagecache(inode, oldsize, offset);
+	truncate_pagecache(inode, offset);
 }
 
 static int

+ 1 - 1
fs/exofs/inode.c

@@ -861,7 +861,7 @@ static int exofs_writepage(struct page *page, struct writeback_control *wbc)
 static void _write_failed(struct inode *inode, loff_t to)
 {
 	if (to > inode->i_size)
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 }
 
 int exofs_write_begin(struct file *file, struct address_space *mapping,

+ 1 - 1
fs/ext2/inode.c

@@ -58,7 +58,7 @@ static void ext2_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		ext2_truncate_blocks(inode, inode->i_size);
 	}
 }

+ 1 - 2
fs/ext4/inode.c

@@ -4587,7 +4587,6 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 
 	if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
 		handle_t *handle;
-		loff_t oldsize = inode->i_size;
 
 		if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
 			struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -4650,7 +4649,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 		 * Truncate pagecache after we've waited for commit
 		 * in data=journal mode to make pages freeable.
 		 */
-		truncate_pagecache(inode, oldsize, inode->i_size);
+			truncate_pagecache(inode, inode->i_size);
 	}
 	/*
 	 * We want to call ext4_truncate() even if attr->ia_size ==

+ 1 - 1
fs/fat/inode.c

@@ -147,7 +147,7 @@ static void fat_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		fat_truncate_blocks(inode, inode->i_size);
 	}
 }

+ 1 - 1
fs/fuse/dir.c

@@ -1678,7 +1678,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
 	 * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock.
 	 */
 	if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
-		truncate_pagecache(inode, oldsize, outarg.attr.size);
+		truncate_pagecache(inode, outarg.attr.size);
 		invalidate_inode_pages2(inode->i_mapping);
 	}
 

+ 1 - 1
fs/fuse/inode.c

@@ -218,7 +218,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
 		bool inval = false;
 
 		if (oldsize != attr->size) {
-			truncate_pagecache(inode, oldsize, attr->size);
+			truncate_pagecache(inode, attr->size);
 			inval = true;
 		} else if (fc->auto_inval_data) {
 			struct timespec new_mtime = {

+ 2 - 2
fs/gfs2/bmap.c

@@ -1016,7 +1016,7 @@ static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize
 		chunk = oldsize - newsize;
 		if (chunk > max_chunk)
 			chunk = max_chunk;
-		truncate_pagecache(inode, oldsize, oldsize - chunk);
+		truncate_pagecache(inode, oldsize - chunk);
 		oldsize -= chunk;
 		gfs2_trans_end(sdp);
 		error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
@@ -1067,7 +1067,7 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
 	if (journaled)
 		error = gfs2_journaled_truncate(inode, oldsize, newsize);
 	else
-		truncate_pagecache(inode, oldsize, newsize);
+		truncate_pagecache(inode, newsize);
 
 	if (error) {
 		brelse(dibh);

+ 1 - 1
fs/hfs/inode.c

@@ -41,7 +41,7 @@ static void hfs_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		hfs_file_truncate(inode);
 	}
 }

+ 1 - 1
fs/hfsplus/inode.c

@@ -36,7 +36,7 @@ static void hfsplus_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		hfsplus_file_truncate(inode);
 	}
 }

+ 1 - 1
fs/hpfs/file.c

@@ -138,7 +138,7 @@ static void hpfs_write_failed(struct address_space *mapping, loff_t to)
 	hpfs_lock(inode->i_sb);
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		hpfs_truncate(inode);
 	}
 

+ 1 - 1
fs/jfs/inode.c

@@ -306,7 +306,7 @@ static void jfs_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		jfs_truncate(inode);
 	}
 }

+ 1 - 1
fs/minix/inode.c

@@ -400,7 +400,7 @@ static void minix_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		minix_truncate(inode);
 	}
 }

+ 1 - 3
fs/nfs/inode.c

@@ -541,7 +541,6 @@ EXPORT_SYMBOL_GPL(nfs_setattr);
  */
 static int nfs_vmtruncate(struct inode * inode, loff_t offset)
 {
-	loff_t oldsize;
 	int err;
 
 	err = inode_newsize_ok(inode, offset);
@@ -549,11 +548,10 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset)
 		goto out;
 
 	spin_lock(&inode->i_lock);
-	oldsize = inode->i_size;
 	i_size_write(inode, offset);
 	spin_unlock(&inode->i_lock);
 
-	truncate_pagecache(inode, oldsize, offset);
+	truncate_pagecache(inode, offset);
 out:
 	return err;
 }

+ 1 - 1
fs/nilfs2/inode.c

@@ -254,7 +254,7 @@ void nilfs_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		nilfs_truncate(inode);
 	}
 }

+ 1 - 1
fs/ntfs/file.c

@@ -1768,7 +1768,7 @@ static void ntfs_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		ntfs_truncate_vfs(inode);
 	}
 }

+ 1 - 1
fs/omfs/file.c

@@ -311,7 +311,7 @@ static void omfs_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		omfs_truncate(inode);
 	}
 }

+ 0 - 6
fs/proc/meminfo.c

@@ -132,13 +132,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		K(i.freeswap),
 		K(global_page_state(NR_FILE_DIRTY)),
 		K(global_page_state(NR_WRITEBACK)),
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-		K(global_page_state(NR_ANON_PAGES)
-		  + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
-		  HPAGE_PMD_NR),
-#else
 		K(global_page_state(NR_ANON_PAGES)),
-#endif
 		K(global_page_state(NR_FILE_MAPPED)),
 		K(global_page_state(NR_SHMEM)),
 		K(global_page_state(NR_SLAB_RECLAIMABLE) +

+ 1 - 1
fs/sysv/itree.c

@@ -469,7 +469,7 @@ static void sysv_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		sysv_truncate(inode);
 	}
 }

+ 1 - 1
fs/udf/inode.c

@@ -172,7 +172,7 @@ static void udf_write_failed(struct address_space *mapping, loff_t to)
 	loff_t isize = inode->i_size;
 
 	if (to > isize) {
-		truncate_pagecache(inode, to, isize);
+		truncate_pagecache(inode, isize);
 		if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
 			down_write(&iinfo->i_data_sem);
 			udf_clear_extent_cache(inode);

+ 1 - 1
fs/ufs/inode.c

@@ -531,7 +531,7 @@ static void ufs_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size)
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 }
 
 static int ufs_write_begin(struct file *file, struct address_space *mapping,

+ 2 - 2
fs/xfs/xfs_aops.c

@@ -1582,7 +1582,7 @@ xfs_vm_write_begin(
 		unlock_page(page);
 
 		if (pos + len > i_size_read(inode))
-			truncate_pagecache(inode, pos + len, i_size_read(inode));
+			truncate_pagecache(inode, i_size_read(inode));
 
 		page_cache_release(page);
 		page = NULL;
@@ -1618,7 +1618,7 @@ xfs_vm_write_end(
 		loff_t		to = pos + len;
 
 		if (to > isize) {
-			truncate_pagecache(inode, to, isize);
+			truncate_pagecache(inode, isize);
 			xfs_vm_kill_delalloc_range(inode, isize, to);
 		}
 	}

+ 0 - 3
include/linux/huge_mm.h

@@ -96,9 +96,6 @@ extern int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 			  pmd_t *dst_pmd, pmd_t *src_pmd,
 			  struct vm_area_struct *vma,
 			  unsigned long addr, unsigned long end);
-extern int handle_pte_fault(struct mm_struct *mm,
-			    struct vm_area_struct *vma, unsigned long address,
-			    pte_t *pte, pmd_t *pmd, unsigned int flags);
 extern int split_huge_page_to_list(struct page *page, struct list_head *list);
 static inline int split_huge_page(struct page *page)
 {

+ 130 - 18
include/linux/memcontrol.h

@@ -30,9 +30,21 @@ struct page;
 struct mm_struct;
 struct kmem_cache;
 
-/* Stats that can be updated by kernel. */
-enum mem_cgroup_page_stat_item {
-	MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */
+/*
+ * The corresponding mem_cgroup_stat_names is defined in mm/memcontrol.c,
+ * These two lists should keep in accord with each other.
+ */
+enum mem_cgroup_stat_index {
+	/*
+	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
+	 */
+	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
+	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
+	MEM_CGROUP_STAT_RSS_HUGE,	/* # of pages charged as anon huge */
+	MEM_CGROUP_STAT_FILE_MAPPED,	/* # of pages charged as file rss */
+	MEM_CGROUP_STAT_WRITEBACK,	/* # of pages under writeback */
+	MEM_CGROUP_STAT_SWAP,		/* # of pages, swapped out */
+	MEM_CGROUP_STAT_NSTATS,
 };
 
 struct mem_cgroup_reclaim_cookie {
@@ -41,6 +53,23 @@ struct mem_cgroup_reclaim_cookie {
 	unsigned int generation;
 };
 
+enum mem_cgroup_filter_t {
+	VISIT,		/* visit current node */
+	SKIP,		/* skip the current node and continue traversal */
+	SKIP_TREE,	/* skip the whole subtree and continue traversal */
+};
+
+/*
+ * mem_cgroup_filter_t predicate might instruct mem_cgroup_iter_cond how to
+ * iterate through the hierarchy tree. Each tree element is checked by the
+ * predicate before it is returned by the iterator. If a filter returns
+ * SKIP or SKIP_TREE then the iterator code continues traversal (with the
+ * next node down the hierarchy or the next node that doesn't belong under the
+ * memcg's subtree).
+ */
+typedef enum mem_cgroup_filter_t
+(*mem_cgroup_iter_filter)(struct mem_cgroup *memcg, struct mem_cgroup *root);
+
 #ifdef CONFIG_MEMCG
 /*
  * All "charge" functions with gfp_mask should use GFP_KERNEL or
@@ -108,9 +137,18 @@ mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
 extern void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 	struct page *oldpage, struct page *newpage, bool migration_ok);
 
-struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
-				   struct mem_cgroup *,
-				   struct mem_cgroup_reclaim_cookie *);
+struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
+				   struct mem_cgroup *prev,
+				   struct mem_cgroup_reclaim_cookie *reclaim,
+				   mem_cgroup_iter_filter cond);
+
+static inline struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
+				   struct mem_cgroup *prev,
+				   struct mem_cgroup_reclaim_cookie *reclaim)
+{
+	return mem_cgroup_iter_cond(root, prev, reclaim, NULL);
+}
+
 void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
 
 /*
@@ -125,6 +163,48 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
 extern void mem_cgroup_replace_page_cache(struct page *oldpage,
 					struct page *newpage);
 
+/**
+ * mem_cgroup_toggle_oom - toggle the memcg OOM killer for the current task
+ * @new: true to enable, false to disable
+ *
+ * Toggle whether a failed memcg charge should invoke the OOM killer
+ * or just return -ENOMEM.  Returns the previous toggle state.
+ *
+ * NOTE: Any path that enables the OOM killer before charging must
+ *       call mem_cgroup_oom_synchronize() afterward to finalize the
+ *       OOM handling and clean up.
+ */
+static inline bool mem_cgroup_toggle_oom(bool new)
+{
+	bool old;
+
+	old = current->memcg_oom.may_oom;
+	current->memcg_oom.may_oom = new;
+
+	return old;
+}
+
+static inline void mem_cgroup_enable_oom(void)
+{
+	bool old = mem_cgroup_toggle_oom(true);
+
+	WARN_ON(old == true);
+}
+
+static inline void mem_cgroup_disable_oom(void)
+{
+	bool old = mem_cgroup_toggle_oom(false);
+
+	WARN_ON(old == false);
+}
+
+static inline bool task_in_memcg_oom(struct task_struct *p)
+{
+	return p->memcg_oom.in_memcg_oom;
+}
+
+bool mem_cgroup_oom_synchronize(void);
+
 #ifdef CONFIG_MEMCG_SWAP
 extern int do_swap_account;
 #endif
@@ -165,24 +245,24 @@ static inline void mem_cgroup_end_update_page_stat(struct page *page,
 }
 
 void mem_cgroup_update_page_stat(struct page *page,
-				 enum mem_cgroup_page_stat_item idx,
+				 enum mem_cgroup_stat_index idx,
 				 int val);
 
 static inline void mem_cgroup_inc_page_stat(struct page *page,
-					    enum mem_cgroup_page_stat_item idx)
+					    enum mem_cgroup_stat_index idx)
 {
 	mem_cgroup_update_page_stat(page, idx, 1);
 }
 
 static inline void mem_cgroup_dec_page_stat(struct page *page,
-					    enum mem_cgroup_page_stat_item idx)
+					    enum mem_cgroup_stat_index idx)
 {
 	mem_cgroup_update_page_stat(page, idx, -1);
 }
 
-unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
-						gfp_t gfp_mask,
-						unsigned long *total_scanned);
+enum mem_cgroup_filter_t
+mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
+		struct mem_cgroup *root);
 
 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
 static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
@@ -296,6 +376,15 @@ static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 		struct page *oldpage, struct page *newpage, bool migration_ok)
 {
 }
+static inline struct mem_cgroup *
+mem_cgroup_iter_cond(struct mem_cgroup *root,
+		struct mem_cgroup *prev,
+		struct mem_cgroup_reclaim_cookie *reclaim,
+		mem_cgroup_iter_filter cond)
+{
+	/* first call must return non-NULL, second return NULL */
+	return (struct mem_cgroup *)(unsigned long)!prev;
+}
 
 static inline struct mem_cgroup *
 mem_cgroup_iter(struct mem_cgroup *root,
@@ -348,22 +437,45 @@ static inline void mem_cgroup_end_update_page_stat(struct page *page,
 {
 }
 
+static inline bool mem_cgroup_toggle_oom(bool new)
+{
+	return false;
+}
+
+static inline void mem_cgroup_enable_oom(void)
+{
+}
+
+static inline void mem_cgroup_disable_oom(void)
+{
+}
+
+static inline bool task_in_memcg_oom(struct task_struct *p)
+{
+	return false;
+}
+
+static inline bool mem_cgroup_oom_synchronize(void)
+{
+	return false;
+}
+
 static inline void mem_cgroup_inc_page_stat(struct page *page,
-					    enum mem_cgroup_page_stat_item idx)
+					    enum mem_cgroup_stat_index idx)
 {
 }
 
 static inline void mem_cgroup_dec_page_stat(struct page *page,
-					    enum mem_cgroup_page_stat_item idx)
+					    enum mem_cgroup_stat_index idx)
 {
 }
 
 static inline
-unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
-					    gfp_t gfp_mask,
-					    unsigned long *total_scanned)
+enum mem_cgroup_filter_t
+mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
+		struct mem_cgroup *root)
 {
-	return 0;
+	return VISIT;
 }
 
 static inline void mem_cgroup_split_huge_fixup(struct page *head)
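
The kernel-doc NOTE on mem_cgroup_toggle_oom() above implies a calling convention for user-fault paths; the real call sites added by this merge live in mm/memory.c and mm/oom_kill.c, which are outside this excerpt. Below is a hedged sketch of that convention using only the helpers declared above; the wrapper name and the specific charge call are illustrative, not actual kernel code.

/* Hedged sketch only: enable the memcg OOM killer around a charge made on
 * behalf of a user fault, then finalize any OOM state, per the NOTE above.
 * charge_with_memcg_oom() is a made-up name for illustration.
 */
static int charge_with_memcg_oom(struct page *page, struct mm_struct *mm,
				 gfp_t gfp_mask)
{
	int ret;

	mem_cgroup_enable_oom();	/* failed charges may now enter memcg OOM */
	ret = mem_cgroup_newpage_charge(page, mm, gfp_mask);
	mem_cgroup_disable_oom();

	/* Finalize and clean up any OOM situation recorded during the charge. */
	if (task_in_memcg_oom(current))
		mem_cgroup_oom_synchronize();

	return ret;
}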

+ 4 - 2
include/linux/mm.h

@@ -176,6 +176,7 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_RETRY_NOWAIT	0x10	/* Don't drop mmap_sem and wait when retrying */
 #define FAULT_FLAG_KILLABLE	0x20	/* The fault task is in SIGKILL killable region */
 #define FAULT_FLAG_TRIED	0x40	/* second try */
+#define FAULT_FLAG_USER		0x80	/* The fault originated in userspace */
 
 /*
  * vm_fault is filled by the the pagefault handler and passed to the vma's
@@ -876,11 +877,12 @@ static inline int page_mapped(struct page *page)
 #define VM_FAULT_NOPAGE	0x0100	/* ->fault installed the pte, not return page */
 #define VM_FAULT_LOCKED	0x0200	/* ->fault locked the returned page */
 #define VM_FAULT_RETRY	0x0400	/* ->fault blocked, must retry */
+#define VM_FAULT_FALLBACK 0x0800	/* huge page fault failed, fall back to small */
 
 #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
 
 #define VM_FAULT_ERROR	(VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \
-			 VM_FAULT_HWPOISON_LARGE)
+			 VM_FAULT_FALLBACK | VM_FAULT_HWPOISON_LARGE)
 
 /* Encode hstate index for a hwpoisoned large page */
 #define VM_FAULT_SET_HINDEX(x) ((x) << 12)
@@ -984,7 +986,7 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping,
 	unmap_mapping_range(mapping, holebegin, holelen, 0);
 }
 
-extern void truncate_pagecache(struct inode *inode, loff_t old, loff_t new);
+extern void truncate_pagecache(struct inode *inode, loff_t new);
 extern void truncate_setsize(struct inode *inode, loff_t newsize);
 void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end);
 int truncate_inode_page(struct address_space *mapping, struct page *page);
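
The arch hunks earlier in this merge all converge on the same shape: build the fault flags incrementally, adding FAULT_FLAG_USER once the faulting context is known and FAULT_FLAG_WRITE only after the access has been validated against the VMA, then pass them to handle_mm_fault(). A condensed sketch of that shape follows; it is not any one architecture's real handler, and retry/signal paths are elided.

/* Hedged sketch of the flag-building pattern the arch patches converge on;
 * not a complete fault handler -- retry and error delivery are elided.
 */
static void sketch_do_page_fault(struct pt_regs *regs, unsigned long address,
				 int is_write)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
	int fault;

	if (user_mode(regs))			/* fault originated in userspace */
		flags |= FAULT_FLAG_USER;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, address);
	if (!vma || vma->vm_start > address)
		goto bad_area;

	if (is_write) {
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		flags |= FAULT_FLAG_WRITE;	/* only for a permitted write */
	}

	fault = handle_mm_fault(mm, vma, address, flags);
	/* ... VM_FAULT_RETRY / VM_FAULT_ERROR handling as in the hunks above ... */
bad_area:
	up_read(&mm->mmap_sem);
}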

+ 1 - 1
include/linux/res_counter.h

@@ -54,7 +54,7 @@ struct res_counter {
 	struct res_counter *parent;
 };
 
-#define RESOURCE_MAX (unsigned long long)LLONG_MAX
+#define RES_COUNTER_MAX ULLONG_MAX
 
 /**
  * Helpers to interact with userspace

+ 7 - 0
include/linux/sched.h

@@ -1393,6 +1393,13 @@ struct task_struct {
 		unsigned long memsw_nr_pages; /* uncharged mem+swap usage */
 	} memcg_batch;
 	unsigned int memcg_kmem_skip_account;
+	struct memcg_oom_info {
+		unsigned int may_oom:1;
+		unsigned int in_memcg_oom:1;
+		unsigned int oom_locked:1;
+		int wakeups;
+		struct mem_cgroup *wait_on_memcg;
+	} memcg_oom;
 #endif
 #ifdef CONFIG_UPROBES
 	struct uprobe_task *utask;

+ 1 - 1
include/linux/swap.h

@@ -280,7 +280,7 @@ extern void activate_page(struct page *);
 extern void mark_page_accessed(struct page *);
 extern void lru_add_drain(void);
 extern void lru_add_drain_cpu(int cpu);
-extern int lru_add_drain_all(void);
+extern void lru_add_drain_all(void);
 extern void rotate_reclaimable_page(struct page *page);
 extern void deactivate_page(struct page *page);
 extern void swap_setup(void);

+ 1 - 1
kernel/gcov/fs.c

@@ -74,7 +74,7 @@ static int __init gcov_persist_setup(char *str)
 {
 	unsigned long val;
 
-	if (strict_strtoul(str, 0, &val)) {
+	if (kstrtoul(str, 0, &val)) {
 		pr_warning("invalid gcov_persist parameter '%s'\n", str);
 		return 0;
 	}

+ 1 - 1
kernel/ksysfs.c

@@ -113,7 +113,7 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj,
 	unsigned long cnt;
 	int ret;
 
-	if (strict_strtoul(buf, 0, &cnt))
+	if (kstrtoul(buf, 0, &cnt))
 		return -EINVAL;
 
 	ret = crash_shrink_memory(cnt);

+ 7 - 7
kernel/params.c

@@ -253,13 +253,13 @@ int parse_args(const char *doing,
 	EXPORT_SYMBOL(param_ops_##name)
 
 
-STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, strict_strtoul);
-STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol);
-STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, strict_strtoul);
-STANDARD_PARAM_DEF(int, int, "%i", long, strict_strtol);
-STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul);
-STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol);
-STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul);
+STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, kstrtoul);
+STANDARD_PARAM_DEF(short, short, "%hi", long, kstrtoul);
+STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, kstrtoul);
+STANDARD_PARAM_DEF(int, int, "%i", long, kstrtoul);
+STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, kstrtoul);
+STANDARD_PARAM_DEF(long, long, "%li", long, kstrtoul);
+STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, kstrtoul);
 
 int param_set_charp(const char *val, const struct kernel_param *kp)
 {
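
For context on the conversions above, here is a hedged sketch of the kstrto*() calling convention that replaces strict_strto*(): the whole string must parse, and failures come back as an errno instead of being silently ignored. parse_threshold() is a hypothetical helper, not kernel code.

/* Hedged sketch of the kstrtoul() convention: returns 0 on success,
 * -EINVAL for bad digits, -ERANGE on overflow.
 */
static int parse_threshold(const char *buf, unsigned long *out)
{
	unsigned long val;
	int err;

	err = kstrtoul(buf, 0, &val);	/* base 0: accepts 0x..., 0..., decimal */
	if (err)
		return err;

	*out = val;
	return 0;
}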

+ 16 - 9
kernel/res_counter.c

@@ -17,8 +17,8 @@
 void res_counter_init(struct res_counter *counter, struct res_counter *parent)
 {
 	spin_lock_init(&counter->lock);
-	counter->limit = RESOURCE_MAX;
-	counter->soft_limit = RESOURCE_MAX;
+	counter->limit = RES_COUNTER_MAX;
+	counter->soft_limit = RES_COUNTER_MAX;
 	counter->parent = parent;
 }
 
@@ -178,23 +178,30 @@ u64 res_counter_read_u64(struct res_counter *counter, int member)
 #endif
 
 int res_counter_memparse_write_strategy(const char *buf,
-					unsigned long long *res)
+					unsigned long long *resp)
 {
 	char *end;
+	unsigned long long res;
 
-	/* return RESOURCE_MAX(unlimited) if "-1" is specified */
+	/* return RES_COUNTER_MAX(unlimited) if "-1" is specified */
 	if (*buf == '-') {
-		*res = simple_strtoull(buf + 1, &end, 10);
-		if (*res != 1 || *end != '\0')
+		res = simple_strtoull(buf + 1, &end, 10);
+		if (res != 1 || *end != '\0')
 			return -EINVAL;
-		*res = RESOURCE_MAX;
+		*resp = RES_COUNTER_MAX;
 		return 0;
 	}
 
-	*res = memparse(buf, &end);
+	res = memparse(buf, &end);
 	if (*end != '\0')
 		return -EINVAL;
 
-	*res = PAGE_ALIGN(*res);
+	if (PAGE_ALIGN(res) >= res)
+		res = PAGE_ALIGN(res);
+	else
+		res = RES_COUNTER_MAX;
+
+	*resp = res;
+
 	return 0;
 }
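
The new branch exists because PAGE_ALIGN() can wrap when the parsed value sits within one page of ULLONG_MAX. A standalone sketch of the failure mode, assuming 4 KiB pages:

    unsigned long long res = 0xfffffffffffffff0ULL;		/* within one page of ULLONG_MAX */
    unsigned long long aligned = (res + 4095) & ~4095ULL;	/* what PAGE_ALIGN() expands to */
    /* aligned == 0 here: the addition wrapped, so the old unconditional
     * PAGE_ALIGN() turned a huge requested limit into almost nothing.
     * The check above clamps such inputs to RES_COUNTER_MAX instead.
     */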

+ 2 - 2
mm/Kconfig

@@ -245,7 +245,7 @@ config COMPACTION
 config MIGRATION
 	bool "Page migration"
 	def_bool y
-	depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA
+	depends on (NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA) && MMU
 	help
 	  Allows the migration of the physical location of pages of processes
 	  while the virtual addresses are not changed. This is useful in
@@ -480,7 +480,7 @@ config FRONTSWAP
 
 config CMA
 	bool "Contiguous Memory Allocator"
-	depends on HAVE_MEMBLOCK
+	depends on HAVE_MEMBLOCK && MMU
 	select MIGRATION
 	select MEMORY_ISOLATION
 	help

+ 35 - 24
mm/filemap.c

@@ -467,32 +467,34 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 	error = mem_cgroup_cache_charge(page, current->mm,
 					gfp_mask & GFP_RECLAIM_MASK);
 	if (error)
-		goto out;
+		return error;
 
 	error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
-	if (error == 0) {
-		page_cache_get(page);
-		page->mapping = mapping;
-		page->index = offset;
-
-		spin_lock_irq(&mapping->tree_lock);
-		error = radix_tree_insert(&mapping->page_tree, offset, page);
-		if (likely(!error)) {
-			mapping->nrpages++;
-			__inc_zone_page_state(page, NR_FILE_PAGES);
-			spin_unlock_irq(&mapping->tree_lock);
-			trace_mm_filemap_add_to_page_cache(page);
-		} else {
-			page->mapping = NULL;
-			/* Leave page->index set: truncation relies upon it */
-			spin_unlock_irq(&mapping->tree_lock);
-			mem_cgroup_uncharge_cache_page(page);
-			page_cache_release(page);
-		}
-		radix_tree_preload_end();
-	} else
+	if (error) {
 		mem_cgroup_uncharge_cache_page(page);
-out:
+		return error;
+	}
+
+	page_cache_get(page);
+	page->mapping = mapping;
+	page->index = offset;
+
+	spin_lock_irq(&mapping->tree_lock);
+	error = radix_tree_insert(&mapping->page_tree, offset, page);
+	radix_tree_preload_end();
+	if (unlikely(error))
+		goto err_insert;
+	mapping->nrpages++;
+	__inc_zone_page_state(page, NR_FILE_PAGES);
+	spin_unlock_irq(&mapping->tree_lock);
+	trace_mm_filemap_add_to_page_cache(page);
+	return 0;
+err_insert:
+	page->mapping = NULL;
+	/* Leave page->index set: truncation relies upon it */
+	spin_unlock_irq(&mapping->tree_lock);
+	mem_cgroup_uncharge_cache_page(page);
+	page_cache_release(page);
 	return error;
 }
 EXPORT_SYMBOL(add_to_page_cache_locked);
@@ -1614,6 +1616,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct inode *inode = mapping->host;
 	pgoff_t offset = vmf->pgoff;
 	struct page *page;
+	bool memcg_oom;
 	pgoff_t size;
 	int ret = 0;
 
@@ -1622,7 +1625,11 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		return VM_FAULT_SIGBUS;
 
 	/*
-	 * Do we have something in the page cache already?
+	 * Do we have something in the page cache already?  Either
+	 * way, try readahead, but disable the memcg OOM killer for it
+	 * as readahead is optional and no errors are propagated up
+	 * the fault stack.  The OOM killer is enabled while trying to
+	 * instantiate the faulting page individually below.
 	 */
 	page = find_get_page(mapping, offset);
 	if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
@@ -1630,10 +1637,14 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		 * We found the page, so try async readahead before
 		 * waiting for the lock.
 		 */
+		memcg_oom = mem_cgroup_toggle_oom(false);
 		do_async_mmap_readahead(vma, ra, file, page, offset);
+		mem_cgroup_toggle_oom(memcg_oom);
 	} else if (!page) {
 		/* No page in the page cache at all */
+		memcg_oom = mem_cgroup_toggle_oom(false);
 		do_sync_mmap_readahead(vma, ra, file, offset);
+		mem_cgroup_toggle_oom(memcg_oom);
 		count_vm_event(PGMAJFAULT);
 		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
 		ret = VM_FAULT_MAJOR;
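
mem_cgroup_toggle_oom() is added by the include/linux/memcontrol.h part of this series (not shown in this excerpt); it is assumed to install the new setting and hand back the previous one, which gives the save/disable/restore shape used twice above:

    bool memcg_oom;

    memcg_oom = mem_cgroup_toggle_oom(false);	/* optional work: no memcg OOM killing */
    /* ... readahead, or any allocation whose failure is tolerable ... */
    mem_cgroup_toggle_oom(memcg_oom);		/* restore the caller's setting */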

+ 56 - 73
mm/huge_memory.c

@@ -695,11 +695,10 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
 	return pmd;
 }
 
-static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma)
+static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
 {
 	pmd_t entry;
-	entry = mk_pmd(page, vma->vm_page_prot);
-	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+	entry = mk_pmd(page, prot);
 	entry = pmd_mkhuge(entry);
 	return entry;
 }
@@ -732,7 +731,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 		pte_free(mm, pgtable);
 	} else {
 		pmd_t entry;
-		entry = mk_huge_pmd(page, vma);
+		entry = mk_huge_pmd(page, vma->vm_page_prot);
+		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 		page_add_new_anon_rmap(page, vma, haddr);
 		pgtable_trans_huge_deposit(mm, pmd, pgtable);
 		set_pmd_at(mm, haddr, pmd, entry);
@@ -788,77 +788,57 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	struct page *page;
 	unsigned long haddr = address & HPAGE_PMD_MASK;
-	pte_t *pte;
 
-	if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
-		if (unlikely(anon_vma_prepare(vma)))
-			return VM_FAULT_OOM;
-		if (unlikely(khugepaged_enter(vma)))
+	if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
+		return VM_FAULT_FALLBACK;
+	if (unlikely(anon_vma_prepare(vma)))
+		return VM_FAULT_OOM;
+	if (unlikely(khugepaged_enter(vma)))
+		return VM_FAULT_OOM;
+	if (!(flags & FAULT_FLAG_WRITE) &&
+			transparent_hugepage_use_zero_page()) {
+		pgtable_t pgtable;
+		struct page *zero_page;
+		bool set;
+		pgtable = pte_alloc_one(mm, haddr);
+		if (unlikely(!pgtable))
 			return VM_FAULT_OOM;
-		if (!(flags & FAULT_FLAG_WRITE) &&
-				transparent_hugepage_use_zero_page()) {
-			pgtable_t pgtable;
-			struct page *zero_page;
-			bool set;
-			pgtable = pte_alloc_one(mm, haddr);
-			if (unlikely(!pgtable))
-				return VM_FAULT_OOM;
-			zero_page = get_huge_zero_page();
-			if (unlikely(!zero_page)) {
-				pte_free(mm, pgtable);
-				count_vm_event(THP_FAULT_FALLBACK);
-				goto out;
-			}
-			spin_lock(&mm->page_table_lock);
-			set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
-					zero_page);
-			spin_unlock(&mm->page_table_lock);
-			if (!set) {
-				pte_free(mm, pgtable);
-				put_huge_zero_page();
-			}
-			return 0;
-		}
-		page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
-					  vma, haddr, numa_node_id(), 0);
-		if (unlikely(!page)) {
+		zero_page = get_huge_zero_page();
+		if (unlikely(!zero_page)) {
+			pte_free(mm, pgtable);
 			count_vm_event(THP_FAULT_FALLBACK);
-			goto out;
-		}
-		count_vm_event(THP_FAULT_ALLOC);
-		if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
-			put_page(page);
-			goto out;
+			return VM_FAULT_FALLBACK;
 		}
-		if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd,
-							  page))) {
-			mem_cgroup_uncharge_page(page);
-			put_page(page);
-			goto out;
+		spin_lock(&mm->page_table_lock);
+		set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
+				zero_page);
+		spin_unlock(&mm->page_table_lock);
+		if (!set) {
+			pte_free(mm, pgtable);
+			put_huge_zero_page();
 		}
-
 		return 0;
 	}
-out:
-	/*
-	 * Use __pte_alloc instead of pte_alloc_map, because we can't
-	 * run pte_offset_map on the pmd, if an huge pmd could
-	 * materialize from under us from a different thread.
-	 */
-	if (unlikely(pmd_none(*pmd)) &&
-	    unlikely(__pte_alloc(mm, vma, pmd, address)))
-		return VM_FAULT_OOM;
-	/* if an huge pmd materialized from under us just retry later */
-	if (unlikely(pmd_trans_huge(*pmd)))
-		return 0;
-	/*
-	 * A regular pmd is established and it can't morph into a huge pmd
-	 * from under us anymore at this point because we hold the mmap_sem
-	 * read mode and khugepaged takes it in write mode. So now it's
-	 * safe to run pte_offset_map().
-	 */
-	pte = pte_offset_map(pmd, address);
-	return handle_pte_fault(mm, vma, address, pte, pmd, flags);
+	page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+			vma, haddr, numa_node_id(), 0);
+	if (unlikely(!page)) {
+		count_vm_event(THP_FAULT_FALLBACK);
+		return VM_FAULT_FALLBACK;
+	}
+	if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
+		put_page(page);
+		count_vm_event(THP_FAULT_FALLBACK);
+		return VM_FAULT_FALLBACK;
+	}
+	if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) {
+		mem_cgroup_uncharge_page(page);
+		put_page(page);
+		count_vm_event(THP_FAULT_FALLBACK);
+		return VM_FAULT_FALLBACK;
+	}
+
+	count_vm_event(THP_FAULT_ALLOC);
+	return 0;
 }
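
With the old out: fallback path gone, falling back to regular pages is now the caller's job. A sketch of the resulting contract, matching the mm/memory.c hunk later in this diff:

    if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
    	int ret = do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags);
    	if (!(ret & VM_FAULT_FALLBACK))
    		return ret;	/* handled (or definitively failed) as a huge page */
    	/* VM_FAULT_FALLBACK: fall through and fault in ordinary PTEs */
    }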
 
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -1170,7 +1150,6 @@ alloc:
 		new_page = NULL;
 
 	if (unlikely(!new_page)) {
-		count_vm_event(THP_FAULT_FALLBACK);
 		if (is_huge_zero_pmd(orig_pmd)) {
 			ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
 					address, pmd, orig_pmd, haddr);
@@ -1181,9 +1160,9 @@ alloc:
 				split_huge_page(page);
 			put_page(page);
 		}
+		count_vm_event(THP_FAULT_FALLBACK);
 		goto out;
 	}
-	count_vm_event(THP_FAULT_ALLOC);
 
 	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
 		put_page(new_page);
@@ -1191,10 +1170,13 @@ alloc:
 			split_huge_page(page);
 			put_page(page);
 		}
+		count_vm_event(THP_FAULT_FALLBACK);
 		ret |= VM_FAULT_OOM;
 		goto out;
 	}
 
+	count_vm_event(THP_FAULT_ALLOC);
+
 	if (is_huge_zero_pmd(orig_pmd))
 		clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
 	else
@@ -1215,7 +1197,8 @@ alloc:
 		goto out_mn;
 	} else {
 		pmd_t entry;
-		entry = mk_huge_pmd(new_page, vma);
+		entry = mk_huge_pmd(new_page, vma->vm_page_prot);
+		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 		pmdp_clear_flush(vma, haddr, pmd);
 		page_add_new_anon_rmap(new_page, vma, haddr);
 		set_pmd_at(mm, haddr, pmd, entry);
@@ -1666,7 +1649,6 @@ static void __split_huge_page_refcount(struct page *page,
 	BUG_ON(atomic_read(&page->_count) <= 0);
 
 	__mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
-	__mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
 
 	ClearPageCompound(page);
 	compound_unlock(page);
@@ -2364,7 +2346,8 @@ static void collapse_huge_page(struct mm_struct *mm,
 	__SetPageUptodate(new_page);
 	pgtable = pmd_pgtable(_pmd);
 
-	_pmd = mk_huge_pmd(new_page, vma);
+	_pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
+	_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
 
 	/*
 	 * spin_lock() below is not the equivalent of smp_wmb(), so

+ 337 - 534
mm/memcontrol.c

@@ -39,7 +39,6 @@
 #include <linux/limits.h>
 #include <linux/export.h>
 #include <linux/mutex.h>
-#include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
@@ -85,26 +84,12 @@ static int really_do_swap_account __initdata = 0;
 #endif
 
 
-/*
- * Statistics for memory cgroup.
- */
-enum mem_cgroup_stat_index {
-	/*
-	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
-	 */
-	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
-	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
-	MEM_CGROUP_STAT_RSS_HUGE,	/* # of pages charged as anon huge */
-	MEM_CGROUP_STAT_FILE_MAPPED,	/* # of pages charged as file rss */
-	MEM_CGROUP_STAT_SWAP,		/* # of pages, swapped out */
-	MEM_CGROUP_STAT_NSTATS,
-};
-
 static const char * const mem_cgroup_stat_names[] = {
 	"cache",
 	"rss",
 	"rss_huge",
 	"mapped_file",
+	"writeback",
 	"swap",
 };
 
@@ -175,10 +160,6 @@ struct mem_cgroup_per_zone {
 
 	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
 
-	struct rb_node		tree_node;	/* RB tree node */
-	unsigned long long	usage_in_excess;/* Set to the value by which */
-						/* the soft limit is exceeded*/
-	bool			on_tree;
 	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
 						/* use container_of	   */
 };
@@ -187,26 +168,6 @@ struct mem_cgroup_per_node {
 	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
 };
 
-/*
- * Cgroups above their limits are maintained in a RB-Tree, independent of
- * their hierarchy representation
- */
-
-struct mem_cgroup_tree_per_zone {
-	struct rb_root rb_root;
-	spinlock_t lock;
-};
-
-struct mem_cgroup_tree_per_node {
-	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
-};
-
-struct mem_cgroup_tree {
-	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
-};
-
-static struct mem_cgroup_tree soft_limit_tree __read_mostly;
-
 struct mem_cgroup_threshold {
 	struct eventfd_ctx *eventfd;
 	u64 threshold;
@@ -280,6 +241,7 @@ struct mem_cgroup {
 
 	bool		oom_lock;
 	atomic_t	under_oom;
+	atomic_t	oom_wakeups;
 
 	int	swappiness;
 	/* OOM-Killer disable */
@@ -304,7 +266,7 @@ struct mem_cgroup {
 	 * Should we move charges of a task when a task is moved into this
 	 * mem_cgroup ? And what type of charges should we move ?
 	 */
-	unsigned long 	move_charge_at_immigrate;
+	unsigned long move_charge_at_immigrate;
 	/*
 	 * set > 0 if pages under this cgroup are moving to other cgroup.
 	 */
@@ -341,6 +303,22 @@ struct mem_cgroup {
 	atomic_t	numainfo_events;
 	atomic_t	numainfo_updating;
 #endif
+	/*
+	 * Protects soft_contributed transitions.
+	 * See mem_cgroup_update_soft_limit
+	 */
+	spinlock_t soft_lock;
+
+	/*
+	 * If true then this group has increased parents' children_in_excess
+	 * when it got over the soft limit.
+	 * When a group falls below the soft limit, parents' children_in_excess
+	 * is decreased and soft_contributed changed to false.
+	 */
+	bool soft_contributed;
+
+	/* Number of children that are in soft limit excess */
+	atomic_t children_in_excess;
 
 	struct mem_cgroup_per_node *nodeinfo[0];
 	/* WARNING: nodeinfo must be the last member here */
@@ -444,7 +422,6 @@ static bool move_file(void)
  * limit reclaim to prevent infinite loops, if they ever occur.
  */
 #define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
-#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
 
 enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
@@ -671,164 +648,6 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
 	return mem_cgroup_zoneinfo(memcg, nid, zid);
 }
 
-static struct mem_cgroup_tree_per_zone *
-soft_limit_tree_node_zone(int nid, int zid)
-{
-	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
-}
-
-static struct mem_cgroup_tree_per_zone *
-soft_limit_tree_from_page(struct page *page)
-{
-	int nid = page_to_nid(page);
-	int zid = page_zonenum(page);
-
-	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
-}
-
-static void
-__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
-				struct mem_cgroup_per_zone *mz,
-				struct mem_cgroup_tree_per_zone *mctz,
-				unsigned long long new_usage_in_excess)
-{
-	struct rb_node **p = &mctz->rb_root.rb_node;
-	struct rb_node *parent = NULL;
-	struct mem_cgroup_per_zone *mz_node;
-
-	if (mz->on_tree)
-		return;
-
-	mz->usage_in_excess = new_usage_in_excess;
-	if (!mz->usage_in_excess)
-		return;
-	while (*p) {
-		parent = *p;
-		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
-					tree_node);
-		if (mz->usage_in_excess < mz_node->usage_in_excess)
-			p = &(*p)->rb_left;
-		/*
-		 * We can't avoid mem cgroups that are over their soft
-		 * limit by the same amount
-		 */
-		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
-			p = &(*p)->rb_right;
-	}
-	rb_link_node(&mz->tree_node, parent, p);
-	rb_insert_color(&mz->tree_node, &mctz->rb_root);
-	mz->on_tree = true;
-}
-
-static void
-__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
-				struct mem_cgroup_per_zone *mz,
-				struct mem_cgroup_tree_per_zone *mctz)
-{
-	if (!mz->on_tree)
-		return;
-	rb_erase(&mz->tree_node, &mctz->rb_root);
-	mz->on_tree = false;
-}
-
-static void
-mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
-				struct mem_cgroup_per_zone *mz,
-				struct mem_cgroup_tree_per_zone *mctz)
-{
-	spin_lock(&mctz->lock);
-	__mem_cgroup_remove_exceeded(memcg, mz, mctz);
-	spin_unlock(&mctz->lock);
-}
-
-
-static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
-{
-	unsigned long long excess;
-	struct mem_cgroup_per_zone *mz;
-	struct mem_cgroup_tree_per_zone *mctz;
-	int nid = page_to_nid(page);
-	int zid = page_zonenum(page);
-	mctz = soft_limit_tree_from_page(page);
-
-	/*
-	 * Necessary to update all ancestors when hierarchy is used.
-	 * because their event counter is not touched.
-	 */
-	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
-		mz = mem_cgroup_zoneinfo(memcg, nid, zid);
-		excess = res_counter_soft_limit_excess(&memcg->res);
-		/*
-		 * We have to update the tree if mz is on RB-tree or
-		 * mem is over its softlimit.
-		 */
-		if (excess || mz->on_tree) {
-			spin_lock(&mctz->lock);
-			/* if on-tree, remove it */
-			if (mz->on_tree)
-				__mem_cgroup_remove_exceeded(memcg, mz, mctz);
-			/*
-			 * Insert again. mz->usage_in_excess will be updated.
-			 * If excess is 0, no tree ops.
-			 */
-			__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
-			spin_unlock(&mctz->lock);
-		}
-	}
-}
-
-static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
-{
-	int node, zone;
-	struct mem_cgroup_per_zone *mz;
-	struct mem_cgroup_tree_per_zone *mctz;
-
-	for_each_node(node) {
-		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
-			mz = mem_cgroup_zoneinfo(memcg, node, zone);
-			mctz = soft_limit_tree_node_zone(node, zone);
-			mem_cgroup_remove_exceeded(memcg, mz, mctz);
-		}
-	}
-}
-
-static struct mem_cgroup_per_zone *
-__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
-{
-	struct rb_node *rightmost = NULL;
-	struct mem_cgroup_per_zone *mz;
-
-retry:
-	mz = NULL;
-	rightmost = rb_last(&mctz->rb_root);
-	if (!rightmost)
-		goto done;		/* Nothing to reclaim from */
-
-	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
-	/*
-	 * Remove the node now but someone else can add it back,
-	 * we will to add it back at the end of reclaim to its correct
-	 * position in the tree.
-	 */
-	__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
-	if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
-		!css_tryget(&mz->memcg->css))
-		goto retry;
-done:
-	return mz;
-}
-
-static struct mem_cgroup_per_zone *
-mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
-{
-	struct mem_cgroup_per_zone *mz;
-
-	spin_lock(&mctz->lock);
-	mz = __mem_cgroup_largest_soft_limit_node(mctz);
-	spin_unlock(&mctz->lock);
-	return mz;
-}
-
 /*
  * Implementation Note: reading percpu statistics for memcg.
  *
@@ -1002,6 +821,48 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 	return false;
 }
 
+/*
+ * Called from rate-limited memcg_check_events when enough
+ * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure
+ * that all the parents up the hierarchy will be notified that this group
+ * is in excess or that it is not in excess anymore. memcg->soft_contributed
+ * makes the transition a single action whenever the state flips from one to
+ * the other.
+ */
+static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg)
+{
+	unsigned long long excess = res_counter_soft_limit_excess(&memcg->res);
+	struct mem_cgroup *parent = memcg;
+	int delta = 0;
+
+	spin_lock(&memcg->soft_lock);
+	if (excess) {
+		if (!memcg->soft_contributed) {
+			delta = 1;
+			memcg->soft_contributed = true;
+		}
+	} else {
+		if (memcg->soft_contributed) {
+			delta = -1;
+			memcg->soft_contributed = false;
+		}
+	}
+
+	/*
+	 * Necessary to update all ancestors when hierarchy is used
+	 * because their event counter is not touched.
+	 * We track children even outside the hierarchy for the root
+	 * cgroup because tree walk starting at root should visit
+	 * all cgroups and we want to prevent a pointless tree
+	 * walk if no children are below the limit.
+	 */
+	while (delta && (parent = parent_mem_cgroup(parent)))
+		atomic_add(delta, &parent->children_in_excess);
+	if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
+		atomic_add(delta, &root_mem_cgroup->children_in_excess);
+	spin_unlock(&memcg->soft_lock);
+}
+
 /*
  * Check events in order.
  *
@@ -1025,7 +886,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 
 		mem_cgroup_threshold(memcg);
 		if (unlikely(do_softlimit))
-			mem_cgroup_update_tree(memcg, page);
+			mem_cgroup_update_soft_limit(memcg);
 #if MAX_NUMNODES > 1
 		if (unlikely(do_numainfo))
 			atomic_inc(&memcg->numainfo_events);
@@ -1068,6 +929,15 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 	return memcg;
 }
 
+static enum mem_cgroup_filter_t
+mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
+		mem_cgroup_iter_filter cond)
+{
+	if (!cond)
+		return VISIT;
+	return cond(memcg, root);
+}
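
A NULL cond means "visit everything", so existing callers keep working; the include/linux/memcontrol.h hunk (not shown here) is assumed to keep the old entry point as a thin wrapper along these lines:

    static inline struct mem_cgroup *
    mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev,
    		struct mem_cgroup_reclaim_cookie *reclaim)
    {
    	return mem_cgroup_iter_cond(root, prev, reclaim, NULL);
    }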
+
 /*
  * Returns a next (in a pre-order walk) alive memcg (with elevated css
  * ref. count) or NULL if the whole root's subtree has been visited.
@@ -1075,7 +945,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
  * helper function to be used by mem_cgroup_iter
  */
 static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
-		struct mem_cgroup *last_visited)
+		struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond)
 {
 	struct cgroup_subsys_state *prev_css, *next_css;
 
@@ -1093,11 +963,31 @@ skip_node:
 	if (next_css) {
 		struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
 
-		if (css_tryget(&mem->css))
-			return mem;
-		else {
+		switch (mem_cgroup_filter(mem, root, cond)) {
+		case SKIP:
 			prev_css = next_css;
 			goto skip_node;
+		case SKIP_TREE:
+			if (mem == root)
+				return NULL;
+			/*
+			 * css_rightmost_descendant is not an optimal way to
+			 * skip through a subtree (especially for imbalanced
+			 * trees leaning to the right) but that's what we have
+			 * right now. A more effective solution would be to
+			 * traverse right-up to the first non-NULL node without
+			 * calling css_next_descendant_pre afterwards.
+			 */
+			prev_css = css_rightmost_descendant(next_css);
+			goto skip_node;
+		case VISIT:
+			if (css_tryget(&mem->css))
+				return mem;
+			else {
+				prev_css = next_css;
+				goto skip_node;
+			}
+			break;
 		}
 	}
 
@@ -1161,6 +1051,7 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
  * @root: hierarchy root
  * @prev: previously returned memcg, NULL on first invocation
  * @reclaim: cookie for shared reclaim walks, NULL for full walks
+ * @cond: filter for visited nodes, NULL for no filter
  *
  * Returns references to children of the hierarchy below @root, or
  * @root itself, or %NULL after a full round-trip.
@@ -1173,15 +1064,18 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
  * divide up the memcgs in the hierarchy among all concurrent
  * reclaimers operating on the same zone and priority.
  */
-struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
+struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
 				   struct mem_cgroup *prev,
-				   struct mem_cgroup_reclaim_cookie *reclaim)
+				   struct mem_cgroup_reclaim_cookie *reclaim,
+				   mem_cgroup_iter_filter cond)
 {
 	struct mem_cgroup *memcg = NULL;
 	struct mem_cgroup *last_visited = NULL;
 
-	if (mem_cgroup_disabled())
-		return NULL;
+	if (mem_cgroup_disabled()) {
+		/* first call must return non-NULL, second return NULL */
+		return (struct mem_cgroup *)(unsigned long)!prev;
+	}
 
 	if (!root)
 		root = root_mem_cgroup;
@@ -1192,7 +1086,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 	if (!root->use_hierarchy && root != root_mem_cgroup) {
 		if (prev)
 			goto out_css_put;
-		return root;
+		if (mem_cgroup_filter(root, root, cond) == VISIT)
+			return root;
+		return NULL;
 	}
 
 	rcu_read_lock();
@@ -1215,7 +1111,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 			last_visited = mem_cgroup_iter_load(iter, root, &seq);
 		}
 
-		memcg = __mem_cgroup_iter_next(root, last_visited);
+		memcg = __mem_cgroup_iter_next(root, last_visited, cond);
 
 		if (reclaim) {
 			mem_cgroup_iter_update(iter, last_visited, memcg, seq);
@@ -1226,7 +1122,11 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 				reclaim->generation = iter->generation;
 		}
 
-		if (prev && !memcg)
+		/*
+		 * We have finished the whole tree walk or no group has been
+		 * visited because filter told us to skip the root node.
+		 */
+		if (!memcg && (prev || (cond && !last_visited)))
 			goto out_unlock;
 	}
 out_unlock:
@@ -1867,6 +1767,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
 	return total;
 }
 
+#if MAX_NUMNODES > 1
 /**
  * test_mem_cgroup_node_reclaimable
  * @memcg: the target memcg
@@ -1889,7 +1790,6 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
 	return false;
 
 }
-#if MAX_NUMNODES > 1
 
 /*
  * Always updating the nodemask is not very good - even if we have an empty
@@ -1957,115 +1857,64 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 	return node;
 }
 
-/*
- * Check all nodes whether it contains reclaimable pages or not.
- * For quick scan, we make use of scan_nodes. This will allow us to skip
- * unused nodes. But scan_nodes is lazily updated and may not cotain
- * enough new information. We need to do double check.
- */
-static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
-{
-	int nid;
-
-	/*
-	 * quick check...making use of scan_node.
-	 * We can skip unused nodes.
-	 */
-	if (!nodes_empty(memcg->scan_nodes)) {
-		for (nid = first_node(memcg->scan_nodes);
-		     nid < MAX_NUMNODES;
-		     nid = next_node(nid, memcg->scan_nodes)) {
-
-			if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
-				return true;
-		}
-	}
-	/*
-	 * Check rest of nodes.
-	 */
-	for_each_node_state(nid, N_MEMORY) {
-		if (node_isset(nid, memcg->scan_nodes))
-			continue;
-		if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
-			return true;
-	}
-	return false;
-}
-
 #else
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 {
 	return 0;
 }
 
-static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
-{
-	return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
-}
 #endif
 
-static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
-				   struct zone *zone,
-				   gfp_t gfp_mask,
-				   unsigned long *total_scanned)
-{
-	struct mem_cgroup *victim = NULL;
-	int total = 0;
-	int loop = 0;
-	unsigned long excess;
-	unsigned long nr_scanned;
-	struct mem_cgroup_reclaim_cookie reclaim = {
-		.zone = zone,
-		.priority = 0,
-	};
-
-	excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
-
-	while (1) {
-		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
-		if (!victim) {
-			loop++;
-			if (loop >= 2) {
-				/*
-				 * If we have not been able to reclaim
-				 * anything, it might because there are
-				 * no reclaimable pages under this hierarchy
-				 */
-				if (!total)
-					break;
-				/*
-				 * We want to do more targeted reclaim.
-				 * excess >> 2 is not to excessive so as to
-				 * reclaim too much, nor too less that we keep
-				 * coming back to reclaim from this cgroup
-				 */
-				if (total >= (excess >> 2) ||
-					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
-					break;
-			}
-			continue;
-		}
-		if (!mem_cgroup_reclaimable(victim, false))
-			continue;
-		total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
-						     zone, &nr_scanned);
-		*total_scanned += nr_scanned;
-		if (!res_counter_soft_limit_excess(&root_memcg->res))
+/*
+ * A group is eligible for the soft limit reclaim under the given root
+ * hierarchy if
+ *	a) it is over its soft limit
+ *	b) any parent up the hierarchy is over its soft limit
+ *
+ * If the given group doesn't have any children over the limit then it
+ * doesn't make any sense to iterate its subtree.
+ */
+enum mem_cgroup_filter_t
+mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
+		struct mem_cgroup *root)
+{
+	struct mem_cgroup *parent;
+
+	if (!memcg)
+		memcg = root_mem_cgroup;
+	parent = memcg;
+
+	if (res_counter_soft_limit_excess(&memcg->res))
+		return VISIT;
+
+	/*
+	 * If any parent up to the root in the hierarchy is over its soft limit
+	 * then we have to obey and reclaim from this group as well.
+	 */
+	while ((parent = parent_mem_cgroup(parent))) {
+		if (res_counter_soft_limit_excess(&parent->res))
+			return VISIT;
+		if (parent == root)
 			break;
 	}
-	mem_cgroup_iter_break(root_memcg, victim);
-	return total;
+
+	if (!atomic_read(&memcg->children_in_excess))
+		return SKIP_TREE;
+	return SKIP;
 }
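
The predicate is meant to be fed straight into the filtered iterator, so a subtree that answers SKIP_TREE is pruned wholesale. A sketch of the intended consumer (the real one is the mm/vmscan.c hunk at the end of this diff, which is truncated in this excerpt):

    struct mem_cgroup_reclaim_cookie reclaim = {
    	.zone = zone,
    	.priority = priority,
    };
    struct mem_cgroup *memcg;

    memcg = mem_cgroup_iter_cond(root, NULL, &reclaim,
    			     mem_cgroup_soft_reclaim_eligible);
    while (memcg) {
    	/* ... shrink this group's LRU lists ... */
    	memcg = mem_cgroup_iter_cond(root, memcg, &reclaim,
    				     mem_cgroup_soft_reclaim_eligible);
    }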
 
+static DEFINE_SPINLOCK(memcg_oom_lock);
+
 /*
  * Check OOM-Killer is already running under our hierarchy.
  * If someone is running, return false.
- * Has to be called with memcg_oom_lock
  */
-static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
+static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *iter, *failed = NULL;
 
+	spin_lock(&memcg_oom_lock);
+
 	for_each_mem_cgroup_tree(iter, memcg) {
 		if (iter->oom_lock) {
 			/*
@@ -2079,33 +1928,33 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
 			iter->oom_lock = true;
 	}
 
-	if (!failed)
-		return true;
-
-	/*
-	 * OK, we failed to lock the whole subtree so we have to clean up
-	 * what we set up to the failing subtree
-	 */
-	for_each_mem_cgroup_tree(iter, memcg) {
-		if (iter == failed) {
-			mem_cgroup_iter_break(memcg, iter);
-			break;
+	if (failed) {
+		/*
+		 * OK, we failed to lock the whole subtree so we have
+		 * to clean up what we set up to the failing subtree
+		 */
+		for_each_mem_cgroup_tree(iter, memcg) {
+			if (iter == failed) {
+				mem_cgroup_iter_break(memcg, iter);
+				break;
+			}
+			iter->oom_lock = false;
 		}
-		iter->oom_lock = false;
 	}
-	return false;
+
+	spin_unlock(&memcg_oom_lock);
+
+	return !failed;
 }
 
-/*
- * Has to be called with memcg_oom_lock
- */
-static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
+static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *iter;
 
+	spin_lock(&memcg_oom_lock);
 	for_each_mem_cgroup_tree(iter, memcg)
 		iter->oom_lock = false;
-	return 0;
+	spin_unlock(&memcg_oom_lock);
 }
 
 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
@@ -2129,7 +1978,6 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
 		atomic_add_unless(&iter->under_oom, -1, 0);
 }
 
-static DEFINE_SPINLOCK(memcg_oom_lock);
 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
 
 struct oom_wait_info {
@@ -2159,6 +2007,7 @@ static int memcg_oom_wake_function(wait_queue_t *wait,
 
 static void memcg_wakeup_oom(struct mem_cgroup *memcg)
 {
+	atomic_inc(&memcg->oom_wakeups);
 	/* for filtering, pass "memcg" as argument. */
 	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
 }
@@ -2170,56 +2019,136 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
 }
 
 /*
- * try to call OOM killer. returns false if we should exit memory-reclaim loop.
+ * try to call OOM killer
  */
-static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
-				  int order)
+static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
-	struct oom_wait_info owait;
-	bool locked, need_to_kill;
+	bool locked;
+	int wakeups;
 
-	owait.memcg = memcg;
-	owait.wait.flags = 0;
-	owait.wait.func = memcg_oom_wake_function;
-	owait.wait.private = current;
-	INIT_LIST_HEAD(&owait.wait.task_list);
-	need_to_kill = true;
-	mem_cgroup_mark_under_oom(memcg);
+	if (!current->memcg_oom.may_oom)
+		return;
+
+	current->memcg_oom.in_memcg_oom = 1;
 
-	/* At first, try to OOM lock hierarchy under memcg.*/
-	spin_lock(&memcg_oom_lock);
-	locked = mem_cgroup_oom_lock(memcg);
 	/*
-	 * Even if signal_pending(), we can't quit charge() loop without
-	 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
-	 * under OOM is always welcomed, use TASK_KILLABLE here.
+	 * As with any blocking lock, a contender needs to start
+	 * listening for wakeups before attempting the trylock,
+	 * otherwise it can miss the wakeup from the unlock and sleep
+	 * indefinitely.  This is just open-coded because our locking
+	 * is so particular to memcg hierarchies.
 	 */
-	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
-	if (!locked || memcg->oom_kill_disable)
-		need_to_kill = false;
+	wakeups = atomic_read(&memcg->oom_wakeups);
+	mem_cgroup_mark_under_oom(memcg);
+
+	locked = mem_cgroup_oom_trylock(memcg);
+
 	if (locked)
 		mem_cgroup_oom_notify(memcg);
-	spin_unlock(&memcg_oom_lock);
 
-	if (need_to_kill) {
-		finish_wait(&memcg_oom_waitq, &owait.wait);
+	if (locked && !memcg->oom_kill_disable) {
+		mem_cgroup_unmark_under_oom(memcg);
 		mem_cgroup_out_of_memory(memcg, mask, order);
+		mem_cgroup_oom_unlock(memcg);
+		/*
+		 * There is no guarantee that an OOM-lock contender
+		 * sees the wakeups triggered by the OOM kill
+		 * uncharges.  Wake any sleepers explicitly.
+		 */
+		memcg_oom_recover(memcg);
 	} else {
-		schedule();
-		finish_wait(&memcg_oom_waitq, &owait.wait);
+		/*
+		 * A system call can just return -ENOMEM, but if this
+		 * is a page fault and somebody else is handling the
+		 * OOM already, we need to sleep on the OOM waitqueue
+		 * for this memcg until the situation is resolved.
+		 * Which can take some time because it might be
+		 * handled by a userspace task.
+		 *
+		 * However, this is the charge context, which means
+		 * that we may sit on a large call stack and hold
+		 * various filesystem locks, the mmap_sem etc. and we
+		 * don't want the OOM handler to deadlock on them
+		 * while we sit here and wait.  Store the current OOM
+		 * context in the task_struct, then return -ENOMEM.
+		 * At the end of the page fault handler, with the
+		 * stack unwound, pagefault_out_of_memory() will check
+		 * back with us by calling
+		 * mem_cgroup_oom_synchronize(), possibly putting the
+		 * task to sleep.
+		 */
+		current->memcg_oom.oom_locked = locked;
+		current->memcg_oom.wakeups = wakeups;
+		css_get(&memcg->css);
+		current->memcg_oom.wait_on_memcg = memcg;
 	}
-	spin_lock(&memcg_oom_lock);
-	if (locked)
-		mem_cgroup_oom_unlock(memcg);
-	memcg_wakeup_oom(memcg);
-	spin_unlock(&memcg_oom_lock);
+}
 
-	mem_cgroup_unmark_under_oom(memcg);
+/**
+ * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ *
+ * This has to be called at the end of a page fault if the memcg
+ * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
+ *
+ * Memcg supports userspace OOM handling, so failed allocations must
+ * sleep on a waitqueue until the userspace task resolves the
+ * situation.  Sleeping directly in the charge context with all kinds
+ * of locks held is not a good idea, instead we remember an OOM state
+ * in the task and mem_cgroup_oom_synchronize() has to be called at
+ * the end of the page fault to put the task to sleep and clean up the
+ * OOM state.
+ *
+ * Returns %true if an ongoing memcg OOM situation was detected and
+ * finalized, %false otherwise.
+ */
+bool mem_cgroup_oom_synchronize(void)
+{
+	struct oom_wait_info owait;
+	struct mem_cgroup *memcg;
 
-	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+	/* OOM is global, do not handle */
+	if (!current->memcg_oom.in_memcg_oom)
 		return false;
-	/* Give chance to dying process */
-	schedule_timeout_uninterruptible(1);
+
+	/*
+	 * We invoked the OOM killer but there is a chance that a kill
+	 * did not free up any charges.  Everybody else might already
+	 * be sleeping, so restart the fault and keep the rampage
+	 * going until some charges are released.
+	 */
+	memcg = current->memcg_oom.wait_on_memcg;
+	if (!memcg)
+		goto out;
+
+	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+		goto out_memcg;
+
+	owait.memcg = memcg;
+	owait.wait.flags = 0;
+	owait.wait.func = memcg_oom_wake_function;
+	owait.wait.private = current;
+	INIT_LIST_HEAD(&owait.wait.task_list);
+
+	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+	/* Only sleep if we didn't miss any wakeups since OOM */
+	if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
+		schedule();
+	finish_wait(&memcg_oom_waitq, &owait.wait);
+out_memcg:
+	mem_cgroup_unmark_under_oom(memcg);
+	if (current->memcg_oom.oom_locked) {
+		mem_cgroup_oom_unlock(memcg);
+		/*
+		 * There is no guarantee that an OOM-lock contender
+		 * sees the wakeups triggered by the OOM kill
+		 * uncharges.  Wake any sleepers explicitely.
+		 * uncharges.  Wake any sleepers explicitly.
+		memcg_oom_recover(memcg);
+	}
+	css_put(&memcg->css);
+	current->memcg_oom.wait_on_memcg = NULL;
+out:
+	current->memcg_oom.in_memcg_oom = 0;
 	return true;
 }
 
@@ -2288,7 +2217,7 @@ void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
 }
 
 void mem_cgroup_update_page_stat(struct page *page,
-				 enum mem_cgroup_page_stat_item idx, int val)
+				 enum mem_cgroup_stat_index idx, int val)
 {
 	struct mem_cgroup *memcg;
 	struct page_cgroup *pc = lookup_page_cgroup(page);
@@ -2297,18 +2226,11 @@ void mem_cgroup_update_page_stat(struct page *page,
 	if (mem_cgroup_disabled())
 		return;
 
+	VM_BUG_ON(!rcu_read_lock_held());
 	memcg = pc->mem_cgroup;
 	if (unlikely(!memcg || !PageCgroupUsed(pc)))
 		return;
 
-	switch (idx) {
-	case MEMCG_NR_FILE_MAPPED:
-		idx = MEM_CGROUP_STAT_FILE_MAPPED;
-		break;
-	default:
-		BUG();
-	}
-
 	this_cpu_add(memcg->stat->count[idx], val);
 }
 
@@ -2450,7 +2372,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
 			flush_work(&stock->work);
 	}
 out:
- 	put_online_cpus();
+	put_online_cpus();
 }
 
 /*
@@ -2532,12 +2454,11 @@ enum {
 	CHARGE_RETRY,		/* need to retry but retry is not bad */
 	CHARGE_NOMEM,		/* we can't do more. return -ENOMEM */
 	CHARGE_WOULDBLOCK,	/* GFP_WAIT wasn't set and no enough res. */
-	CHARGE_OOM_DIE,		/* the current is killed because of OOM */
 };
 
 static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 				unsigned int nr_pages, unsigned int min_pages,
-				bool oom_check)
+				bool invoke_oom)
 {
 	unsigned long csize = nr_pages * PAGE_SIZE;
 	struct mem_cgroup *mem_over_limit;
@@ -2594,14 +2515,10 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	if (mem_cgroup_wait_acct_move(mem_over_limit))
 		return CHARGE_RETRY;
 
-	/* If we don't need to call oom-killer at el, return immediately */
-	if (!oom_check)
-		return CHARGE_NOMEM;
-	/* check OOM */
-	if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
-		return CHARGE_OOM_DIE;
+	if (invoke_oom)
+		mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
 
-	return CHARGE_RETRY;
+	return CHARGE_NOMEM;
 }
 
 /*
@@ -2704,7 +2621,7 @@ again:
 	}
 
 	do {
-		bool oom_check;
+		bool invoke_oom = oom && !nr_oom_retries;
 
 		/* If killed, bypass charge */
 		if (fatal_signal_pending(current)) {
@@ -2712,14 +2629,8 @@ again:
 			goto bypass;
 		}
 
-		oom_check = false;
-		if (oom && !nr_oom_retries) {
-			oom_check = true;
-			nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
-		}
-
-		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages,
-		    oom_check);
+		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
+					   nr_pages, invoke_oom);
 		switch (ret) {
 		case CHARGE_OK:
 			break;
@@ -2732,16 +2643,12 @@ again:
 			css_put(&memcg->css);
 			goto nomem;
 		case CHARGE_NOMEM: /* OOM routine works */
-			if (!oom) {
+			if (!oom || invoke_oom) {
 				css_put(&memcg->css);
 				goto nomem;
 			}
-			/* If oom, we never return -ENOMEM */
 			nr_oom_retries--;
 			break;
-		case CHARGE_OOM_DIE: /* Killed by OOM Killer */
-			css_put(&memcg->css);
-			goto bypass;
 		}
 	} while (ret != CHARGE_OK);
 
@@ -2882,7 +2789,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
 	 * is accessed after testing USED bit. To make pc->mem_cgroup visible
 	 * before USED bit, we need memory barrier here.
 	 * See mem_cgroup_add_lru_list(), etc.
- 	 */
+	 */
 	smp_wmb();
 	SetPageCgroupUsed(pc);
 
@@ -2905,9 +2812,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
 	unlock_page_cgroup(pc);
 
 	/*
-	 * "charge_statistics" updated event counter. Then, check it.
-	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
-	 * if they exceeds softlimit.
+	 * "charge_statistics" updated event counter.
 	 */
 	memcg_check_events(memcg, page);
 }
@@ -3626,9 +3531,9 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
 	 * the page allocator. Therefore, the following sequence when backed by
 	 * the SLUB allocator:
 	 *
-	 * 	memcg_stop_kmem_account();
-	 * 	kmalloc(<large_number>)
-	 * 	memcg_resume_kmem_account();
+	 *	memcg_stop_kmem_account();
+	 *	kmalloc(<large_number>)
+	 *	memcg_resume_kmem_account();
 	 *
 	 * would effectively ignore the fact that we should skip accounting,
 	 * since it will drive us directly to this function without passing
@@ -3750,6 +3655,20 @@ void mem_cgroup_split_huge_fixup(struct page *head)
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
+static inline
+void mem_cgroup_move_account_page_stat(struct mem_cgroup *from,
+					struct mem_cgroup *to,
+					unsigned int nr_pages,
+					enum mem_cgroup_stat_index idx)
+{
+	/* Update stat data for mem_cgroup */
+	preempt_disable();
+	WARN_ON_ONCE(from->stat->count[idx] < nr_pages);
+	__this_cpu_add(from->stat->count[idx], -nr_pages);
+	__this_cpu_add(to->stat->count[idx], nr_pages);
+	preempt_enable();
+}
+
 /**
  * mem_cgroup_move_account - move account of the page
  * @page: the page
@@ -3795,13 +3714,14 @@ static int mem_cgroup_move_account(struct page *page,
 
 	move_lock_mem_cgroup(from, &flags);
 
-	if (!anon && page_mapped(page)) {
-		/* Update mapped_file data for mem_cgroup */
-		preempt_disable();
-		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
-		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
-		preempt_enable();
-	}
+	if (!anon && page_mapped(page))
+		mem_cgroup_move_account_page_stat(from, to, nr_pages,
+			MEM_CGROUP_STAT_FILE_MAPPED);
+
+	if (PageWriteback(page))
+		mem_cgroup_move_account_page_stat(from, to, nr_pages,
+			MEM_CGROUP_STAT_WRITEBACK);
+
 	mem_cgroup_charge_statistics(from, page, anon, -nr_pages);
 
 	/* caller should have done css_get */
@@ -4657,7 +4577,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 				   MEM_CGROUP_RECLAIM_SHRINK);
 		curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
 		/* Usage is reduced ? */
-  		if (curusage >= oldusage)
+		if (curusage >= oldusage)
 			retry_count--;
 		else
 			oldusage = curusage;
@@ -4678,7 +4598,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 	int enlarge = 0;
 
 	/* see mem_cgroup_resize_res_limit */
- 	retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
+	retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
 	oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
 	while (retry_count) {
 		if (signal_pending(current)) {
@@ -4727,98 +4647,6 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 	return ret;
 }
 
-unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
-					    gfp_t gfp_mask,
-					    unsigned long *total_scanned)
-{
-	unsigned long nr_reclaimed = 0;
-	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
-	unsigned long reclaimed;
-	int loop = 0;
-	struct mem_cgroup_tree_per_zone *mctz;
-	unsigned long long excess;
-	unsigned long nr_scanned;
-
-	if (order > 0)
-		return 0;
-
-	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
-	/*
-	 * This loop can run a while, specially if mem_cgroup's continuously
-	 * keep exceeding their soft limit and putting the system under
-	 * pressure
-	 */
-	do {
-		if (next_mz)
-			mz = next_mz;
-		else
-			mz = mem_cgroup_largest_soft_limit_node(mctz);
-		if (!mz)
-			break;
-
-		nr_scanned = 0;
-		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
-						    gfp_mask, &nr_scanned);
-		nr_reclaimed += reclaimed;
-		*total_scanned += nr_scanned;
-		spin_lock(&mctz->lock);
-
-		/*
-		 * If we failed to reclaim anything from this memory cgroup
-		 * it is time to move on to the next cgroup
-		 */
-		next_mz = NULL;
-		if (!reclaimed) {
-			do {
-				/*
-				 * Loop until we find yet another one.
-				 *
-				 * By the time we get the soft_limit lock
-				 * again, someone might have aded the
-				 * group back on the RB tree. Iterate to
-				 * make sure we get a different mem.
-				 * mem_cgroup_largest_soft_limit_node returns
-				 * NULL if no other cgroup is present on
-				 * the tree
-				 */
-				next_mz =
-				__mem_cgroup_largest_soft_limit_node(mctz);
-				if (next_mz == mz)
-					css_put(&next_mz->memcg->css);
-				else /* next_mz == NULL or other memcg */
-					break;
-			} while (1);
-		}
-		__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
-		excess = res_counter_soft_limit_excess(&mz->memcg->res);
-		/*
-		 * One school of thought says that we should not add
-		 * back the node to the tree if reclaim returns 0.
-		 * But our reclaim could return 0, simply because due
-		 * to priority we are exposing a smaller subset of
-		 * memory to reclaim from. Consider this as a longer
-		 * term TODO.
-		 */
-		/* If excess == 0, no tree ops */
-		__mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
-		spin_unlock(&mctz->lock);
-		css_put(&mz->memcg->css);
-		loop++;
-		/*
-		 * Could not reclaim anything and there are no more
-		 * mem cgroups to try or we seem to be looping without
-		 * reclaiming anything.
-		 */
-		if (!nr_reclaimed &&
-			(next_mz == NULL ||
-			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
-			break;
-	} while (!nr_reclaimed);
-	if (next_mz)
-		css_put(&next_mz->memcg->css);
-	return nr_reclaimed;
-}
-
 /**
  * mem_cgroup_force_empty_list - clears LRU of a group
  * @memcg: group to clear
@@ -4990,18 +4818,12 @@ static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css,
 					unsigned int event)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
-	int ret;
 
 	if (mem_cgroup_is_root(memcg))
 		return -EINVAL;
-	css_get(&memcg->css);
-	ret = mem_cgroup_force_empty(memcg);
-	css_put(&memcg->css);
-
-	return ret;
+	return mem_cgroup_force_empty(memcg);
 }
 
-
 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
 				     struct cftype *cft)
 {
@@ -5139,7 +4961,7 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
 	 */
 	mutex_lock(&memcg_create_mutex);
 	mutex_lock(&set_limit_mutex);
-	if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
+	if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) {
 		if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) {
 			ret = -EBUSY;
 			goto out;
@@ -5149,7 +4971,7 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
 
 		ret = memcg_update_cache_sizes(memcg);
 		if (ret) {
-			res_counter_set_limit(&memcg->kmem, RESOURCE_MAX);
+			res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX);
 			goto out;
 		}
 		static_key_slow_inc(&memcg_kmem_enabled_key);
@@ -6089,8 +5911,6 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 		mz = &pn->zoneinfo[zone];
 		lruvec_init(&mz->lruvec);
-		mz->usage_in_excess = 0;
-		mz->on_tree = false;
 		mz->memcg = memcg;
 	}
 	memcg->nodeinfo[node] = pn;
@@ -6146,7 +5966,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 	int node;
 	size_t size = memcg_size();
 
-	mem_cgroup_remove_from_trees(memcg);
 	free_css_id(&mem_cgroup_subsys, &memcg->css);
 
 	for_each_node(node)
@@ -6183,29 +6002,6 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
 }
 EXPORT_SYMBOL(parent_mem_cgroup);
 
-static void __init mem_cgroup_soft_limit_tree_init(void)
-{
-	struct mem_cgroup_tree_per_node *rtpn;
-	struct mem_cgroup_tree_per_zone *rtpz;
-	int tmp, node, zone;
-
-	for_each_node(node) {
-		tmp = node;
-		if (!node_state(node, N_NORMAL_MEMORY))
-			tmp = -1;
-		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
-		BUG_ON(!rtpn);
-
-		soft_limit_tree.rb_tree_per_node[node] = rtpn;
-
-		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
-			rtpz = &rtpn->rb_tree_per_zone[zone];
-			rtpz->rb_root = RB_ROOT;
-			spin_lock_init(&rtpz->lock);
-		}
-	}
-}
-
 static struct cgroup_subsys_state * __ref
 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 {
@@ -6235,6 +6031,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 	mutex_init(&memcg->thresholds_lock);
 	spin_lock_init(&memcg->move_lock);
 	vmpressure_init(&memcg->vmpressure);
+	spin_lock_init(&memcg->soft_lock);
 
 	return &memcg->css;
 
@@ -6312,6 +6109,13 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 
 	mem_cgroup_invalidate_reclaim_iterators(memcg);
 	mem_cgroup_reparent_charges(memcg);
+	if (memcg->soft_contributed) {
+		while ((memcg = parent_mem_cgroup(memcg)))
+			atomic_dec(&memcg->children_in_excess);
+
+		if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
+			atomic_dec(&root_mem_cgroup->children_in_excess);
+	}
 	mem_cgroup_destroy_all_caches(memcg);
 	vmpressure_cleanup(&memcg->vmpressure);
 }
@@ -6986,7 +6790,6 @@ static int __init mem_cgroup_init(void)
 {
 	hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
 	enable_swap_cgroup();
-	mem_cgroup_soft_limit_tree_init();
 	memcg_stock_init();
 	return 0;
 }

+ 39 - 13
mm/memory.c

@@ -3695,7 +3695,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
  * but allow concurrent faults), and pte mapped but not yet locked.
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
-int handle_pte_fault(struct mm_struct *mm,
+static int handle_pte_fault(struct mm_struct *mm,
 		     struct vm_area_struct *vma, unsigned long address,
 		     pte_t *pte, pmd_t *pmd, unsigned int flags)
 {
@@ -3754,22 +3754,14 @@ unlock:
 /*
  * By the time we get here, we already hold the mm semaphore
  */
-int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, unsigned int flags)
+static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+			     unsigned long address, unsigned int flags)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
 
-	__set_current_state(TASK_RUNNING);
-
-	count_vm_event(PGFAULT);
-	mem_cgroup_count_vm_event(mm, PGFAULT);
-
-	/* do counter updates before entering really critical section. */
-	check_sync_rss_stat(current);
-
 	if (unlikely(is_vm_hugetlb_page(vma)))
 		return hugetlb_fault(mm, vma, address, flags);
 
@@ -3782,9 +3774,12 @@ retry:
 	if (!pmd)
 		return VM_FAULT_OOM;
 	if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
+		int ret = VM_FAULT_FALLBACK;
 		if (!vma->vm_ops)
-			return do_huge_pmd_anonymous_page(mm, vma, address,
-							  pmd, flags);
+			ret = do_huge_pmd_anonymous_page(mm, vma, address,
+					pmd, flags);
+		if (!(ret & VM_FAULT_FALLBACK))
+			return ret;
 	} else {
 		pmd_t orig_pmd = *pmd;
 		int ret;
@@ -3850,6 +3845,37 @@ retry:
 	return handle_pte_fault(mm, vma, address, pte, pmd, flags);
 }
 
+int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+		    unsigned long address, unsigned int flags)
+{
+	int ret;
+
+	__set_current_state(TASK_RUNNING);
+
+	count_vm_event(PGFAULT);
+	mem_cgroup_count_vm_event(mm, PGFAULT);
+
+	/* do counter updates before entering really critical section. */
+	check_sync_rss_stat(current);
+
+	/*
+	 * Enable the memcg OOM handling for faults triggered in user
+	 * space.  Kernel faults are handled more gracefully.
+	 */
+	if (flags & FAULT_FLAG_USER)
+		mem_cgroup_enable_oom();
+
+	ret = __handle_mm_fault(mm, vma, address, flags);
+
+	if (flags & FAULT_FLAG_USER)
+		mem_cgroup_disable_oom();
+
+	if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)))
+		mem_cgroup_oom_synchronize();
+
+	return ret;
+}
+
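
The FAULT_FLAG_USER check leans on the arch fault handlers (the arch/*/mm/fault.c hunks in the file list, not shown in this excerpt); the assumption is that each of them sets the flag only for faults taken from user mode, roughly:

    unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;

    if (user_mode(regs))		/* fault came from userspace */
    	flags |= FAULT_FLAG_USER;
    if (write)			/* hypothetical "was it a write" flag */
    	flags |= FAULT_FLAG_WRITE;

    fault = handle_mm_fault(mm, vma, address, flags);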
 #ifndef __PAGETABLE_PUD_FOLDED
 /*
  * Allocate page upper directory.

+ 5 - 2
mm/oom_kill.c

@@ -678,9 +678,12 @@ out:
  */
 void pagefault_out_of_memory(void)
 {
-	struct zonelist *zonelist = node_zonelist(first_online_node,
-						  GFP_KERNEL);
+	struct zonelist *zonelist;
 
+	if (mem_cgroup_oom_synchronize())
+		return;
+
+	zonelist = node_zonelist(first_online_node, GFP_KERNEL);
 	if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) {
 		out_of_memory(NULL, 0, 0, NULL, false);
 		clear_zonelist_oom(zonelist, GFP_KERNEL);

+ 15 - 0
mm/page-writeback.c

@@ -2143,11 +2143,17 @@ EXPORT_SYMBOL(account_page_dirtied);
 
 /*
  * Helper function for set_page_writeback family.
+ *
+ * The caller must hold the mem_cgroup_begin/end_update_page_stat() lock
+ * while calling this function.
+ * See test_set_page_writeback for an example.
+ *
  * NOTE: Unlike account_page_dirtied this does not rely on being atomic
  * wrt interrupts.
  */
 void account_page_writeback(struct page *page)
 {
+	mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
 	inc_zone_page_state(page, NR_WRITEBACK);
 }
 EXPORT_SYMBOL(account_page_writeback);
@@ -2364,7 +2370,10 @@ int test_clear_page_writeback(struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
 	int ret;
+	bool locked;
+	unsigned long memcg_flags;
 
+	mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags);
 	if (mapping) {
 		struct backing_dev_info *bdi = mapping->backing_dev_info;
 		unsigned long flags;
@@ -2385,9 +2394,11 @@ int test_clear_page_writeback(struct page *page)
 		ret = TestClearPageWriteback(page);
 	}
 	if (ret) {
+		mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
 		dec_zone_page_state(page, NR_WRITEBACK);
 		inc_zone_page_state(page, NR_WRITTEN);
 	}
+	mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags);
 	return ret;
 }
 
@@ -2395,7 +2406,10 @@ int test_set_page_writeback(struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
 	int ret;
+	bool locked;
+	unsigned long memcg_flags;
 
+	mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags);
 	if (mapping) {
 		struct backing_dev_info *bdi = mapping->backing_dev_info;
 		unsigned long flags;
@@ -2422,6 +2436,7 @@ int test_set_page_writeback(struct page *page)
 	}
 	if (!ret)
 		account_page_writeback(page);
+	mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags);
 	return ret;
 
 }
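
Both functions now follow the bracketing rule the new comment on account_page_writeback() spells out: the memcg statistic may only be touched between mem_cgroup_begin_update_page_stat() and mem_cgroup_end_update_page_stat(). Reduced to a skeleton:

    bool locked;
    unsigned long flags;

    mem_cgroup_begin_update_page_stat(page, &locked, &flags);
    if (!TestSetPageWriteback(page))
    	account_page_writeback(page);	/* memcg WRITEBACK + zone NR_WRITEBACK */
    mem_cgroup_end_update_page_stat(page, &locked, &flags);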

+ 11 - 11
mm/rmap.c

@@ -1052,11 +1052,11 @@ void do_page_add_anon_rmap(struct page *page,
 {
 	int first = atomic_inc_and_test(&page->_mapcount);
 	if (first) {
-		if (!PageTransHuge(page))
-			__inc_zone_page_state(page, NR_ANON_PAGES);
-		else
+		if (PageTransHuge(page))
 			__inc_zone_page_state(page,
 					      NR_ANON_TRANSPARENT_HUGEPAGES);
+		__mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
+				hpage_nr_pages(page));
 	}
 	if (unlikely(PageKsm(page)))
 		return;
@@ -1085,10 +1085,10 @@ void page_add_new_anon_rmap(struct page *page,
 	VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
 	SetPageSwapBacked(page);
 	atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
-	if (!PageTransHuge(page))
-		__inc_zone_page_state(page, NR_ANON_PAGES);
-	else
+	if (PageTransHuge(page))
 		__inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+	__mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
+			hpage_nr_pages(page));
 	__page_set_anon_rmap(page, vma, address, 1);
 	if (!mlocked_vma_newpage(vma, page)) {
 		SetPageActive(page);
@@ -1111,7 +1111,7 @@ void page_add_file_rmap(struct page *page)
 	mem_cgroup_begin_update_page_stat(page, &locked, &flags);
 	if (atomic_inc_and_test(&page->_mapcount)) {
 		__inc_zone_page_state(page, NR_FILE_MAPPED);
-		mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
+		mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
 	}
 	mem_cgroup_end_update_page_stat(page, &locked, &flags);
 }
@@ -1148,14 +1148,14 @@ void page_remove_rmap(struct page *page)
 		goto out;
 	if (anon) {
 		mem_cgroup_uncharge_page(page);
-		if (!PageTransHuge(page))
-			__dec_zone_page_state(page, NR_ANON_PAGES);
-		else
+		if (PageTransHuge(page))
 			__dec_zone_page_state(page,
 					      NR_ANON_TRANSPARENT_HUGEPAGES);
+		__mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
+				-hpage_nr_pages(page));
 	} else {
 		__dec_zone_page_state(page, NR_FILE_MAPPED);
-		mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED);
+		mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
 		mem_cgroup_end_update_page_stat(page, &locked, &flags);
 	}
 	if (unlikely(PageMlocked(page)))
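
For readability of the accounting change above: NR_ANON_PAGES is now adjusted by hpage_nr_pages(page), i.e. it counts base pages rather than only non-THP heads, while NR_ANON_TRANSPARENT_HUGEPAGES still counts THP heads. A simplified sketch of what hpage_nr_pages() evaluates to (condensed from the in-tree helper purely to make the delta readable; the _sketch name is made up):

	/* Simplified restatement of hpage_nr_pages(), for illustration. */
	static inline int hpage_nr_pages_sketch(struct page *page)
	{
		if (unlikely(PageTransHuge(page)))
			return HPAGE_PMD_NR;	/* e.g. 512 with 4K pages and 2M THP */
		return 1;			/* ordinary page */
	}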

+ 39 - 5
mm/swap.c

@@ -432,6 +432,11 @@ static void activate_page_drain(int cpu)
 		pagevec_lru_move_fn(pvec, __activate_page, NULL);
 }
 
+static bool need_activate_page_drain(int cpu)
+{
+	return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
+}
+
 void activate_page(struct page *page)
 {
 	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
@@ -449,6 +454,11 @@ static inline void activate_page_drain(int cpu)
 {
 }
 
+static bool need_activate_page_drain(int cpu)
+{
+	return false;
+}
+
 void activate_page(struct page *page)
 {
 	struct zone *zone = page_zone(page);
@@ -701,12 +711,36 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
 	lru_add_drain();
 }
 
-/*
- * Returns 0 for success
- */
-int lru_add_drain_all(void)
+static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
+
+void lru_add_drain_all(void)
 {
-	return schedule_on_each_cpu(lru_add_drain_per_cpu);
+	static DEFINE_MUTEX(lock);
+	static struct cpumask has_work;
+	int cpu;
+
+	mutex_lock(&lock);
+	get_online_cpus();
+	cpumask_clear(&has_work);
+
+	for_each_online_cpu(cpu) {
+		struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
+
+		if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
+		    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
+		    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
+		    need_activate_page_drain(cpu)) {
+			INIT_WORK(work, lru_add_drain_per_cpu);
+			schedule_work_on(cpu, work);
+			cpumask_set_cpu(cpu, &has_work);
+		}
+	}
+
+	for_each_cpu(cpu, &has_work)
+		flush_work(&per_cpu(lru_add_drain_work, cpu));
+
+	put_online_cpus();
+	mutex_unlock(&lock);
 }
 
 /*

+ 2 - 7
mm/truncate.c

@@ -567,7 +567,6 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
 /**
  * truncate_pagecache - unmap and remove pagecache that has been truncated
  * @inode: inode
- * @oldsize: old file size
  * @newsize: new file size
  *
  * inode's new i_size must already be written before truncate_pagecache
@@ -580,7 +579,7 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
  * situations such as writepage being called for a page that has already
  * had its underlying blocks deallocated.
  */
-void truncate_pagecache(struct inode *inode, loff_t oldsize, loff_t newsize)
+void truncate_pagecache(struct inode *inode, loff_t newsize)
 {
 	struct address_space *mapping = inode->i_mapping;
 	loff_t holebegin = round_up(newsize, PAGE_SIZE);
@@ -614,12 +613,8 @@ EXPORT_SYMBOL(truncate_pagecache);
  */
 void truncate_setsize(struct inode *inode, loff_t newsize)
 {
-	loff_t oldsize;
-
-	oldsize = inode->i_size;
 	i_size_write(inode, newsize);
-
-	truncate_pagecache(inode, oldsize, newsize);
+	truncate_pagecache(inode, newsize);
 }
 EXPORT_SYMBOL(truncate_setsize);
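
With the oldsize parameter gone, the contract is simply: write the new i_size first, then drop the pagecache beyond it, which is exactly what truncate_setsize() does. A minimal sketch of a filesystem size-change path using the new single-argument API (example_setattr_size() and the surrounding filesystem are hypothetical):

	#include <linux/fs.h>
	#include <linux/mm.h>

	static int example_setattr_size(struct inode *inode, loff_t newsize)
	{
		/* Updates i_size, then calls truncate_pagecache(inode, newsize). */
		truncate_setsize(inode, newsize);

		/* Filesystem-specific block deallocation would follow here. */
		return 0;
	}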
 

+ 52 - 31
mm/vmscan.c

@@ -139,11 +139,23 @@ static bool global_reclaim(struct scan_control *sc)
 {
 	return !sc->target_mem_cgroup;
 }
+
+static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
+{
+	struct mem_cgroup *root = sc->target_mem_cgroup;
+	return !mem_cgroup_disabled() &&
+		mem_cgroup_soft_reclaim_eligible(root, root) != SKIP_TREE;
+}
 #else
 static bool global_reclaim(struct scan_control *sc)
 {
 	return true;
 }
+
+static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
+{
+	return false;
+}
 #endif
 
 unsigned long zone_reclaimable_pages(struct zone *zone)
@@ -2164,9 +2176,11 @@ static inline bool should_continue_reclaim(struct zone *zone,
 	}
 }
 
-static void shrink_zone(struct zone *zone, struct scan_control *sc)
+static int
+__shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
 {
 	unsigned long nr_reclaimed, nr_scanned;
+	int groups_scanned = 0;
 
 	do {
 		struct mem_cgroup *root = sc->target_mem_cgroup;
@@ -2174,15 +2188,17 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
 			.zone = zone,
 			.priority = sc->priority,
 		};
-		struct mem_cgroup *memcg;
+		struct mem_cgroup *memcg = NULL;
+		mem_cgroup_iter_filter filter = (soft_reclaim) ?
+			mem_cgroup_soft_reclaim_eligible : NULL;
 
 		nr_reclaimed = sc->nr_reclaimed;
 		nr_scanned = sc->nr_scanned;
 
-		memcg = mem_cgroup_iter(root, NULL, &reclaim);
-		do {
+		while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) {
 			struct lruvec *lruvec;
 
+			groups_scanned++;
 			lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 
 			shrink_lruvec(lruvec, sc);
@@ -2202,8 +2218,7 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
 				mem_cgroup_iter_break(root, memcg);
 				break;
 			}
-			memcg = mem_cgroup_iter(root, memcg, &reclaim);
-		} while (memcg);
+		}
 
 		vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
 			   sc->nr_scanned - nr_scanned,
@@ -2211,6 +2226,37 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
 
 	} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
 					 sc->nr_scanned - nr_scanned, sc));
+
+	return groups_scanned;
+}
+
+
+static void shrink_zone(struct zone *zone, struct scan_control *sc)
+{
+	bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc);
+	unsigned long nr_scanned = sc->nr_scanned;
+	int scanned_groups;
+
+	scanned_groups = __shrink_zone(zone, sc, do_soft_reclaim);
+	/*
+	 * The memcg iterator might race with another reclaimer or start
+	 * from an incomplete tree walk, so the walk in __shrink_zone
+	 * might have missed groups that are above the soft limit. Try
+	 * one more loop to catch up with the others. Do it just once to
+	 * avoid extra reclaim latency when other reclaimers keep
+	 * preempting this one.
+	 */
+	if (do_soft_reclaim && !scanned_groups)
+		__shrink_zone(zone, sc, do_soft_reclaim);
+
+	/*
+	 * Either no group is over its soft limit, or those that are have no
+	 * pages in the zone we are reclaiming, so we have to reclaim everybody.
+	 */
+	if (do_soft_reclaim && (sc->nr_scanned == nr_scanned)) {
+		__shrink_zone(zone, sc, false);
+		return;
+	}
 }
 
 /* Returns true if compaction should go ahead for a high-order request */
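
Condensed, the three-step flow that the two comments above describe is roughly the sketch below; this is a restatement of the code in the hunk, not additional logic, and the _sketch name is made up.

	static void shrink_zone_sketch(struct zone *zone, struct scan_control *sc)
	{
		bool soft = mem_cgroup_should_soft_reclaim(sc);
		unsigned long before = sc->nr_scanned;

		/* Pass 1: walk only memcgs eligible for soft reclaim. */
		if (__shrink_zone(zone, sc, soft) == 0 && soft)
			/* Pass 2: the filtered walk may have raced or started
			 * mid-tree; retry the filtered walk once. */
			__shrink_zone(zone, sc, soft);

		/* Pass 3: nothing over its soft limit had pages in this zone,
		 * so fall back to reclaiming from everybody. */
		if (soft && sc->nr_scanned == before)
			__shrink_zone(zone, sc, false);
	}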
@@ -2274,8 +2320,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 {
 	struct zoneref *z;
 	struct zone *zone;
-	unsigned long nr_soft_reclaimed;
-	unsigned long nr_soft_scanned;
 	bool aborted_reclaim = false;
 
 	/*
@@ -2315,18 +2359,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 					continue;
 				}
 			}
-			/*
-			 * This steals pages from memory cgroups over softlimit
-			 * and returns the number of reclaimed pages and
-			 * scanned pages. This works for global memory pressure
-			 * and balancing, not for a memcg's limit.
-			 */
-			nr_soft_scanned = 0;
-			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
-						sc->order, sc->gfp_mask,
-						&nr_soft_scanned);
-			sc->nr_reclaimed += nr_soft_reclaimed;
-			sc->nr_scanned += nr_soft_scanned;
 			/* need some check to avoid more shrink_zone() */
 		}
 
@@ -2920,8 +2952,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 {
 	int i;
 	int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
-	unsigned long nr_soft_reclaimed;
-	unsigned long nr_soft_scanned;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
 		.priority = DEF_PRIORITY,
@@ -3036,15 +3066,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 
 			sc.nr_scanned = 0;
 
-			nr_soft_scanned = 0;
-			/*
-			 * Call soft limit reclaim before calling shrink_zone.
-			 */
-			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
-							order, sc.gfp_mask,
-							&nr_soft_scanned);
-			sc.nr_reclaimed += nr_soft_reclaimed;
-
 			/*
 			 * There should be no need to raise the scanning
 			 * priority if enough pages are already being scanned

+ 5 - 5
net/ipv4/tcp_memcontrol.c

@@ -87,8 +87,8 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
 	if (!cg_proto)
 		return -EINVAL;
 
-	if (val > RESOURCE_MAX)
-		val = RESOURCE_MAX;
+	if (val > RES_COUNTER_MAX)
+		val = RES_COUNTER_MAX;
 
 	tcp = tcp_from_cgproto(cg_proto);
 
@@ -101,9 +101,9 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
 		tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT,
 					     net->ipv4.sysctl_tcp_mem[i]);
 
-	if (val == RESOURCE_MAX)
+	if (val == RES_COUNTER_MAX)
 		clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
-	else if (val != RESOURCE_MAX) {
+	else if (val != RES_COUNTER_MAX) {
 		/*
 		 * The active bit needs to be written after the static_key
 		 * update. This is what guarantees that the socket activation
@@ -187,7 +187,7 @@ static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft)
 
 	switch (cft->private) {
 	case RES_LIMIT:
-		val = tcp_read_stat(memcg, RES_LIMIT, RESOURCE_MAX);
+		val = tcp_read_stat(memcg, RES_LIMIT, RES_COUNTER_MAX);
 		break;
 	case RES_USAGE:
 		val = tcp_read_usage(memcg);