Merge branch 'akpm' (patches from Andrew)

Merge updates from Andrew Morton:

 - various misc bits

 - most of MM (quite a lot of MM material is awaiting the merge of
   linux-next dependencies)

 - kasan

 - printk updates

 - procfs updates

 - MAINTAINERS

 - lib/ updates

 - checkpatch updates

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (123 commits)
  init: reduce rootwait polling interval time to 5ms
  binfmt_elf: use vmalloc() for allocation of vma_filesz
  checkpatch: don't emit unified-diff error for rename-only patches
  checkpatch: don't check c99 types like uint8_t under tools
  checkpatch: avoid multiple line dereferences
  checkpatch: don't check .pl files, improve absolute path commit log test
  scripts/checkpatch.pl: fix spelling
  checkpatch: don't try to get maintained status when --no-tree is given
  lib/ida: document locking requirements a bit better
  lib/rbtree.c: fix typo in comment of ____rb_erase_color
  lib/Kconfig.debug: make CONFIG_STRICT_DEVMEM depend on CONFIG_DEVMEM
  MAINTAINERS: add drm and drm/i915 irc channels
  MAINTAINERS: add "C:" for URI for chat where developers hang out
  MAINTAINERS: add drm and drm/i915 bug filing info
  MAINTAINERS: add "B:" for URI where to file bugs
  get_maintainer: look for arbitrary letter prefixes in sections
  printk: add Kconfig option to set default console loglevel
  printk/sound: handle more message headers
  printk/btrfs: handle more message headers
  printk/kdb: handle more message headers
  ...
Linus Torvalds committed 8 years ago, commit e34bac726d
100 changed files with 1264 additions and 818 deletions
  1. Documentation/devicetree/booting-without-of.txt (+7 -0)
  2. Documentation/filesystems/proc.txt (+2 -0)
  3. Documentation/kernel-parameters.txt (+1 -1)
  4. Documentation/vm/transhuge.txt (+5 -0)
  5. MAINTAINERS (+8 -0)
  6. arch/arm/include/asm/tlb.h (+11 -10)
  7. arch/ia64/include/asm/tlb.h (+13 -12)
  8. arch/m32r/Kconfig (+1 -1)
  9. arch/m32r/include/asm/device.h (+5 -1)
  10. arch/m32r/include/asm/dma-mapping.h (+32 -0)
  11. arch/m32r/platforms/m32700ut/setup.c (+2 -0)
  12. arch/powerpc/include/asm/book3s/64/pgtable.h (+12 -1)
  13. arch/powerpc/include/asm/tlb.h (+16 -0)
  14. arch/powerpc/mm/numa.c (+1 -12)
  15. arch/s390/include/asm/tlb.h (+8 -6)
  16. arch/s390/mm/gmap.c (+1 -1)
  17. arch/sh/include/asm/tlb.h (+9 -6)
  18. arch/um/include/asm/tlb.h (+9 -6)
  19. arch/x86/kernel/ldt.c (+1 -1)
  20. arch/x86/kernel/setup.c (+24 -0)
  21. block/blk-settings.c (+1 -0)
  22. block/blk-sysfs.c (+1 -0)
  23. drivers/of/fdt.c (+19 -0)
  24. drivers/pcmcia/m32r_pcc.c (+28 -13)
  25. drivers/sh/intc/virq.c (+1 -1)
  26. fs/binfmt_elf.c (+4 -2)
  27. fs/btrfs/super.c (+15 -11)
  28. fs/dax.c (+6 -4)
  29. fs/fs-writeback.c (+7 -9)
  30. fs/ocfs2/aops.c (+3 -4)
  31. fs/ocfs2/aops.h (+1 -2)
  32. fs/ocfs2/cluster/heartbeat.c (+1 -1)
  33. fs/ocfs2/dlm/dlmmaster.c (+1 -10)
  34. fs/ocfs2/dlm/dlmrecovery.c (+0 -2)
  35. fs/ocfs2/inode.c (+1 -1)
  36. fs/ocfs2/journal.c (+2 -2)
  37. fs/ocfs2/mmap.c (+1 -2)
  38. fs/ocfs2/namei.c (+4 -2)
  39. fs/ocfs2/ocfs2.h (+1 -1)
  40. fs/ocfs2/refcounttree.c (+0 -1)
  41. fs/ocfs2/super.c (+1 -1)
  42. fs/proc/array.c (+4 -3)
  43. fs/proc/base.c (+19 -12)
  44. fs/proc/inode.c (+26 -11)
  45. fs/proc/internal.h (+2 -1)
  46. fs/proc/root.c (+1 -0)
  47. fs/proc/task_mmu.c (+1 -0)
  48. include/asm-generic/pgtable.h (+2 -11)
  49. include/asm-generic/tlb.h (+50 -33)
  50. include/linux/backing-dev-defs.h (+2 -1)
  51. include/linux/cma.h (+3 -0)
  52. include/linux/compiler-gcc.h (+1 -1)
  53. include/linux/huge_mm.h (+2 -0)
  54. include/linux/kthread.h (+1 -1)
  55. include/linux/mempolicy.h (+8 -0)
  56. include/linux/of_fdt.h (+1 -0)
  57. include/linux/printk.h (+16 -1)
  58. include/linux/radix-tree.h (+12 -22)
  59. include/linux/rmap.h (+9 -1)
  60. include/linux/sched.h (+5 -1)
  61. include/linux/swap.h (+1 -33)
  62. include/linux/vmalloc.h (+1 -0)
  63. init/do_mounts.c (+1 -1)
  64. kernel/debug/kdb/kdb_io.c (+1 -1)
  65. kernel/fork.c (+1 -1)
  66. kernel/hung_task.c (+2 -1)
  67. kernel/kthread.c (+3 -2)
  68. kernel/printk/nmi.c (+53 -30)
  69. kernel/sys.c (+0 -10)
  70. lib/Kconfig.debug (+20 -1)
  71. lib/idr.c (+11 -0)
  72. lib/radix-tree.c (+190 -107)
  73. lib/rbtree.c (+19 -4)
  74. mm/Kconfig (+2 -6)
  75. mm/compaction.c (+3 -22)
  76. mm/debug.c (+4 -0)
  77. mm/filemap.c (+10 -58)
  78. mm/gup.c (+8 -11)
  79. mm/huge_memory.c (+49 -4)
  80. mm/hugetlb.c (+15 -10)
  81. mm/kasan/quarantine.c (+48 -46)
  82. mm/kasan/report.c (+2 -0)
  83. mm/khugepaged.c (+31 -6)
  84. mm/kmemleak.c (+1 -1)
  85. mm/madvise.c (+1 -0)
  86. mm/memcontrol.c (+14 -1)
  87. mm/memory.c (+64 -28)
  88. mm/memory_hotplug.c (+0 -20)
  89. mm/mempolicy.c (+12 -18)
  90. mm/migrate.c (+13 -6)
  91. mm/mprotect.c (+18 -1)
  92. mm/page_alloc.c (+56 -19)
  93. mm/percpu.c (+12 -4)
  94. mm/readahead.c (+28 -11)
  95. mm/rmap.c (+34 -35)
  96. mm/shmem.c (+8 -7)
  97. mm/slab.c (+47 -82)
  98. mm/slab_common.c (+31 -2)
  99. mm/slab.h (+18 -2)
  100. mm/slob.c (+1 -1)

+ 7 - 0
Documentation/devicetree/booting-without-of.txt

@@ -974,6 +974,13 @@ compatibility.
      4Gb. Some vendors prefer splitting those ranges into smaller
      segments, but the kernel doesn't care.
 
+  Additional properties:
+
+    - hotpluggable : The presence of this property provides an explicit
+      hint to the operating system that this memory may potentially be
+      removed later. The kernel can take this into consideration when
+      doing nonmovable allocations and when laying out memory zones.
+
  e) The /chosen node
 
  This node is a bit "special". Normally, that's where Open Firmware

+ 2 - 0
Documentation/filesystems/proc.txt

@@ -191,6 +191,7 @@ read the file /proc/PID/status:
  CapPrm: 0000000000000000
  CapEff: 0000000000000000
  CapBnd: ffffffffffffffff
+  NoNewPrivs:     0
  Seccomp:        0
  voluntary_ctxt_switches:        0
  nonvoluntary_ctxt_switches:     1
@@ -262,6 +263,7 @@ Table 1-2: Contents of the status files (as of 4.1)
 CapPrm                      bitmap of permitted capabilities
 CapEff                      bitmap of effective capabilities
 CapBnd                      bitmap of capabilities bounding set
+ NoNewPrivs                  no_new_privs, like prctl(PR_GET_NO_NEW_PRIV, ...)
 Seccomp                     seccomp mode, like prctl(PR_GET_SECCOMP, ...)
 Cpus_allowed                mask of CPUs on which this process may run
 Cpus_allowed_list           Same as previous, but in "list format"

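The new NoNewPrivs field mirrors what prctl() reports for the calling task; a minimal user-space cross-check (illustrative only, not part of this commit) might look like:

#include <stdio.h>
#include <string.h>
#include <sys/prctl.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/status", "r");

	if (!f)
		return 1;
	/* Print the NoNewPrivs line added by this series... */
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "NoNewPrivs:", 11))
			fputs(line, stdout);
	fclose(f);
	/* ...and the same bit as seen through prctl(). */
	printf("prctl(PR_GET_NO_NEW_PRIVS) = %d\n",
	       prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0));
	return 0;
}
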
+ 1 - 1
Documentation/kernel-parameters.txt

@@ -2397,7 +2397,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			that the amount of memory usable for all allocations
 			is not too small.
 
-	movable_node	[KNL,X86] Boot-time switch to enable the effects
+	movable_node	[KNL] Boot-time switch to enable the effects
 			of CONFIG_MOVABLE_NODE=y. See mm/Kconfig for details.
 
 	MTD_Partition=	[MTD]

+ 5 - 0
Documentation/vm/transhuge.txt

@@ -136,6 +136,11 @@ or enable it back by writing 1:
 echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page
 echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page
 
+Some userspace (such as a test program, or an optimized memory allocation
+library) may want to know the size (in bytes) of a transparent hugepage:
+
+cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size
+
 khugepaged will be automatically started when
 transparent_hugepage/enabled is set to "always" or "madvise, and it'll
 be automatically shutdown if it's set to "never".

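A minimal user-space reader for the new hpage_pmd_size file, purely as an illustration of the interface described above (not part of this commit):

#include <stdio.h>

int main(void)
{
	unsigned long long hpage_size;
	FILE *f = fopen("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", "r");

	if (!f)
		return 1;
	if (fscanf(f, "%llu", &hpage_size) != 1) {
		fclose(f);
		return 1;
	}
	fclose(f);
	/* Typically 2097152 (2 MiB) on x86-64, but always read it at runtime. */
	printf("transparent hugepage size: %llu bytes\n", hpage_size);
	return 0;
}
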
+ 8 - 0
MAINTAINERS

@@ -74,6 +74,10 @@ Descriptions of section entries:
 	   These reviewers should be CCed on patches.
 	L: Mailing list that is relevant to this area
 	W: Web-page with status/info
+	B: URI for where to file bugs. A web-page with detailed bug
+	   filing info, a direct bug tracker link, or a mailto: URI.
+	C: URI for chat protocol, server and channel where developers
+	   usually hang out, for example irc://server/channel.
 	Q: Patchwork web based patch tracking system site
 	T: SCM tree type and location.
 	   Type is one of: git, hg, quilt, stgit, topgit
@@ -4024,6 +4028,8 @@ DRM DRIVERS
 M:	David Airlie <airlied@linux.ie>
 L:	dri-devel@lists.freedesktop.org
 T:	git git://people.freedesktop.org/~airlied/linux
+B:	https://bugs.freedesktop.org/
+C:	irc://chat.freenode.net/dri-devel
 S:	Maintained
 F:	drivers/gpu/drm/
 F:	drivers/gpu/vga/
@@ -4076,6 +4082,8 @@ M:	Jani Nikula <jani.nikula@linux.intel.com>
 L:	intel-gfx@lists.freedesktop.org
 L:	dri-devel@lists.freedesktop.org
 W:	https://01.org/linuxgraphics/
+B:	https://01.org/linuxgraphics/documentation/how-report-bugs
+C:	irc://chat.freenode.net/intel-gfx
 Q:	http://patchwork.freedesktop.org/project/intel-gfx/
 T:	git git://anongit.freedesktop.org/drm-intel
 S:	Supported

+ 11 - 10
arch/arm/include/asm/tlb.h

@@ -186,6 +186,8 @@ tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long addr)
 	tlb_add_flush(tlb, addr);
 }
 
+#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)	\
+	tlb_remove_tlb_entry(tlb, ptep, address)
 /*
  * In the case of tlb vma handling, we can optimise these away in the
  * case where we're doing a full MM flush.  When we're doing a munmap,
@@ -211,18 +213,17 @@ tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
 
 static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
 {
+	tlb->pages[tlb->nr++] = page;
+	VM_WARN_ON(tlb->nr > tlb->max);
 	if (tlb->nr == tlb->max)
 		return true;
-	tlb->pages[tlb->nr++] = page;
 	return false;
 }
 
 static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
 {
-	if (__tlb_remove_page(tlb, page)) {
+	if (__tlb_remove_page(tlb, page))
 		tlb_flush_mmu(tlb);
-		__tlb_remove_page(tlb, page);
-	}
 }
 
 static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
@@ -231,12 +232,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
 	return __tlb_remove_page(tlb, page);
 }
 
-static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb,
-					 struct page *page)
-{
-	return __tlb_remove_page(tlb, page);
-}
-
 static inline void tlb_remove_page_size(struct mmu_gather *tlb,
 					struct page *page, int page_size)
 {
@@ -284,5 +279,11 @@ tlb_remove_pmd_tlb_entry(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr
 
 #define tlb_migrate_finish(mm)		do { } while (0)
 
+#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
+static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
+						     unsigned int page_size)
+{
+}
+
 #endif /* CONFIG_MMU */
 #endif

+ 13 - 12
arch/ia64/include/asm/tlb.h

@@ -207,15 +207,15 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
  */
 static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
 {
-	if (tlb->nr == tlb->max)
-		return true;
-
 	tlb->need_flush = 1;
 
 	if (!tlb->nr && tlb->pages == tlb->local)
 		__tlb_alloc_page(tlb);
 
 	tlb->pages[tlb->nr++] = page;
+	VM_WARN_ON(tlb->nr > tlb->max);
+	if (tlb->nr == tlb->max)
+		return true;
 	return false;
 }
 
@@ -236,10 +236,8 @@ static inline void tlb_flush_mmu(struct mmu_gather *tlb)
 
 static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
 {
-	if (__tlb_remove_page(tlb, page)) {
+	if (__tlb_remove_page(tlb, page))
 		tlb_flush_mmu(tlb);
-		__tlb_remove_page(tlb, page);
-	}
 }
 
 static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
@@ -248,12 +246,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
 	return __tlb_remove_page(tlb, page);
 }
 
-static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb,
-					 struct page *page)
-{
-	return __tlb_remove_page(tlb, page);
-}
-
 static inline void tlb_remove_page_size(struct mmu_gather *tlb,
 					struct page *page, int page_size)
 {
@@ -283,6 +275,15 @@ do {							\
 	__tlb_remove_tlb_entry(tlb, ptep, addr);	\
 } while (0)
 
+#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)	\
+	tlb_remove_tlb_entry(tlb, ptep, address)
+
+#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
+static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
+						     unsigned int page_size)
+{
+}
+
 #define pte_free_tlb(tlb, ptep, address)		\
 do {							\
 	tlb->need_flush = 1;				\

+ 1 - 1
arch/m32r/Kconfig

@@ -34,7 +34,7 @@ config NO_IOPORT_MAP
 	def_bool y
 
 config NO_DMA
-	def_bool y
+	def_bool n
 
 config HZ
 	int

+ 5 - 1
arch/m32r/include/asm/device.h

@@ -3,5 +3,9 @@
  *
  * This file is released under the GPLv2
  */
-#include <asm-generic/device.h>
+struct dev_archdata {
+	struct dma_map_ops *dma_ops;
+};
 
+struct pdev_archdata {
+};

+ 32 - 0
arch/m32r/include/asm/dma-mapping.h

@@ -0,0 +1,32 @@
+#ifndef _ASM_M32R_DMA_MAPPING_H
+#define _ASM_M32R_DMA_MAPPING_H
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/scatterlist.h>
+#include <linux/dma-debug.h>
+#include <linux/io.h>
+
+#define DMA_ERROR_CODE (~(dma_addr_t)0x0)
+
+static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+{
+	if (dev && dev->archdata.dma_ops)
+		return dev->archdata.dma_ops;
+	return &dma_noop_ops;
+}
+
+static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
+				  enum dma_data_direction direction)
+{
+}
+
+static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
+{
+	if (!dev->dma_mask)
+		return false;
+	return addr + size - 1 <= *dev->dma_mask;
+}
+
+#endif /* _ASM_M32R_DMA_MAPPING_H */

+ 2 - 0
arch/m32r/platforms/m32700ut/setup.c

@@ -201,6 +201,7 @@ static struct irq_chip m32700ut_lanpld_irq_type =
 #define lcdpldirq2port(x)	(unsigned long)((int)M32700UT_LCD_ICUCR1 + \
 				 (((x) - 1) * sizeof(unsigned short)))
 
+#ifdef CONFIG_USB
 static pld_icu_data_t lcdpld_icu_data[M32700UT_NUM_LCD_PLD_IRQ];
 
 static void disable_m32700ut_lcdpld_irq(unsigned int irq)
@@ -253,6 +254,7 @@ static struct irq_chip m32700ut_lcdpld_irq_type =
 	.irq_mask	= mask_m32700ut_lcdpld,
 	.irq_unmask	= unmask_m32700ut_lcdpld,
 };
+#endif
 
 void __init init_IRQ(void)
 {

+ 12 - 1
arch/powerpc/include/asm/book3s/64/pgtable.h

@@ -1009,7 +1009,8 @@ static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma,
 #define pmd_move_must_withdraw pmd_move_must_withdraw
 struct spinlock;
 static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
-					 struct spinlock *old_pmd_ptl)
+					 struct spinlock *old_pmd_ptl,
+					 struct vm_area_struct *vma)
 {
 	if (radix_enabled())
 		return false;
@@ -1020,6 +1021,16 @@ static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
 	 */
 	return true;
 }
+
+
+#define arch_needs_pgtable_deposit arch_needs_pgtable_deposit
+static inline bool arch_needs_pgtable_deposit(void)
+{
+	if (radix_enabled())
+		return false;
+	return true;
+}
+
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */

+ 16 - 0
arch/powerpc/include/asm/tlb.h

@@ -28,6 +28,7 @@
 #define tlb_start_vma(tlb, vma)	do { } while (0)
 #define tlb_end_vma(tlb, vma)	do { } while (0)
 #define __tlb_remove_tlb_entry	__tlb_remove_tlb_entry
+#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
 
 extern void tlb_flush(struct mmu_gather *tlb);
 
@@ -46,6 +47,21 @@ static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep,
 #endif
 }
 
+static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
+						     unsigned int page_size)
+{
+	if (!tlb->page_size)
+		tlb->page_size = page_size;
+	else if (tlb->page_size != page_size) {
+		tlb_flush_mmu(tlb);
+		/*
+		 * update the page size after flush for the new
+		 * mmu_gather.
+		 */
+		tlb->page_size = page_size;
+	}
+}
+
 #ifdef CONFIG_SMP
 static inline int mm_is_core_local(struct mm_struct *mm)
 {

+ 1 - 12
arch/powerpc/mm/numa.c

@@ -1085,7 +1085,7 @@ static int hot_add_node_scn_to_nid(unsigned long scn_addr)
 int hot_add_scn_to_nid(unsigned long scn_addr)
 {
 	struct device_node *memory = NULL;
-	int nid, found = 0;
+	int nid;
 
 	if (!numa_enabled || (min_common_depth < 0))
 		return first_online_node;
@@ -1101,17 +1101,6 @@ int hot_add_scn_to_nid(unsigned long scn_addr)
 	if (nid < 0 || !node_online(nid))
 		nid = first_online_node;
 
-	if (NODE_DATA(nid)->node_spanned_pages)
-		return nid;
-
-	for_each_online_node(nid) {
-		if (NODE_DATA(nid)->node_spanned_pages) {
-			found = 1;
-			break;
-		}
-	}
-
-	BUG_ON(!found);
 	return nid;
 }
 

+ 8 - 6
arch/s390/include/asm/tlb.h

@@ -104,12 +104,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
 	return __tlb_remove_page(tlb, page);
 }
 
-static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb,
-					 struct page *page)
-{
-	return __tlb_remove_page(tlb, page);
-}
-
 static inline void tlb_remove_page_size(struct mmu_gather *tlb,
 					struct page *page, int page_size)
 {
@@ -162,5 +156,13 @@ static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
 #define tlb_remove_tlb_entry(tlb, ptep, addr)	do { } while (0)
 #define tlb_remove_pmd_tlb_entry(tlb, pmdp, addr)	do { } while (0)
 #define tlb_migrate_finish(mm)			do { } while (0)
+#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)	\
+	tlb_remove_tlb_entry(tlb, ptep, address)
+
+#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
+static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
+						     unsigned int page_size)
+{
+}
 
 #endif /* _S390_TLB_H */

+ 1 - 1
arch/s390/mm/gmap.c

@@ -1015,7 +1015,7 @@ static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
 	if (slot) {
 		rmap->next = radix_tree_deref_slot_protected(slot,
 							&sg->guest_table_lock);
-		radix_tree_replace_slot(slot, rmap);
+		radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap);
 	} else {
 		rmap->next = NULL;
 		radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT,

+ 9 - 6
arch/sh/include/asm/tlb.h

@@ -65,6 +65,9 @@ tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long address)
 		tlb->end = address + PAGE_SIZE;
 }
 
+#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)	\
+	tlb_remove_tlb_entry(tlb, ptep, address)
+
 /*
  * In the case of tlb vma handling, we can optimise these away in the
  * case where we're doing a full MM flush.  When we're doing a munmap,
@@ -115,18 +118,18 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
 	return __tlb_remove_page(tlb, page);
 }
 
-static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb,
-					 struct page *page)
-{
-	return __tlb_remove_page(tlb, page);
-}
-
 static inline void tlb_remove_page_size(struct mmu_gather *tlb,
 					struct page *page, int page_size)
 {
 	return tlb_remove_page(tlb, page);
 }
 
+#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
+static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
+						     unsigned int page_size)
+{
+}
+
 #define pte_free_tlb(tlb, ptep, addr)	pte_free((tlb)->mm, ptep)
 #define pmd_free_tlb(tlb, pmdp, addr)	pmd_free((tlb)->mm, pmdp)
 #define pud_free_tlb(tlb, pudp, addr)	pud_free((tlb)->mm, pudp)

+ 9 - 6
arch/um/include/asm/tlb.h

@@ -116,12 +116,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
 	return __tlb_remove_page(tlb, page);
 }
 
-static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb,
-					 struct page *page)
-{
-	return __tlb_remove_page(tlb, page);
-}
-
 static inline void tlb_remove_page_size(struct mmu_gather *tlb,
 					struct page *page, int page_size)
 {
@@ -141,6 +135,15 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb,
 		__tlb_remove_tlb_entry(tlb, ptep, address);	\
 	} while (0)
 
+#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)	\
+	tlb_remove_tlb_entry(tlb, ptep, address)
+
+#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
+static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
+						     unsigned int page_size)
+{
+}
+
 #define pte_free_tlb(tlb, ptep, addr) __pte_free_tlb(tlb, ptep, addr)
 
 #define pud_free_tlb(tlb, pudp, addr) __pud_free_tlb(tlb, pudp, addr)

+ 1 - 1
arch/x86/kernel/ldt.c

@@ -93,7 +93,7 @@ static void free_ldt_struct(struct ldt_struct *ldt)
 
 	paravirt_free_ldt(ldt->entries, ldt->size);
 	if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
-		vfree(ldt->entries);
+		vfree_atomic(ldt->entries);
 	else
 		free_page((unsigned long)ldt->entries);
 	kfree(ldt);

+ 24 - 0
arch/x86/kernel/setup.c

@@ -985,6 +985,30 @@ void __init setup_arch(char **cmdline_p)
 
 	parse_early_param();
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+	/*
+	 * Memory used by the kernel cannot be hot-removed because Linux
+	 * cannot migrate the kernel pages. When memory hotplug is
+	 * enabled, we should prevent memblock from allocating memory
+	 * for the kernel.
+	 *
+	 * ACPI SRAT records all hotpluggable memory ranges. But before
+	 * SRAT is parsed, we don't know about it.
+	 *
+	 * The kernel image is loaded into memory at very early time. We
+	 * cannot prevent this anyway. So on NUMA system, we set any
+	 * node the kernel resides in as un-hotpluggable.
+	 *
+	 * Since on modern servers, one node could have double-digit
+	 * gigabytes memory, we can assume the memory around the kernel
+	 * image is also un-hotpluggable. So before SRAT is parsed, just
+	 * allocate memory near the kernel image to try the best to keep
+	 * the kernel away from hotpluggable memory.
+	 */
+	if (movable_node_is_enabled())
+		memblock_set_bottom_up(true);
+#endif
+
 	x86_report_nx();
 
 	/* after early param, so could get panic from serial */

+ 1 - 0
block/blk-settings.c

@@ -249,6 +249,7 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto
 	max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors);
 	max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS);
 	limits->max_sectors = max_sectors;
+	q->backing_dev_info.io_pages = max_sectors >> (PAGE_SHIFT - 9);
 }
 EXPORT_SYMBOL(blk_queue_max_hw_sectors);
 

+ 1 - 0
block/blk-sysfs.c

@@ -212,6 +212,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
 
 	spin_lock_irq(q->queue_lock);
 	q->limits.max_sectors = max_sectors_kb << 1;
+	q->backing_dev_info.io_pages = max_sectors_kb >> (PAGE_SHIFT - 10);
 	spin_unlock_irq(q->queue_lock);
 
 	return ret;

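Both hunks keep backing_dev_info.io_pages equal to the queue's sector limit expressed in pages: shifting by (PAGE_SHIFT - 9) converts 512-byte sectors to pages, and shifting by (PAGE_SHIFT - 10) converts the KiB value used in sysfs. A stand-alone arithmetic check with assumed values (4 KiB pages, 1280 KiB limit), illustration only:

#include <stdio.h>

int main(void)
{
	unsigned int page_shift = 12;		/* assumed: 4 KiB pages */
	unsigned int max_sectors = 2560;	/* 512-byte sectors == 1280 KiB */
	unsigned int max_sectors_kb = max_sectors >> 1;

	/* Both conversions land on the same page count (320). */
	printf("io_pages (from sectors) = %u\n", max_sectors >> (page_shift - 9));
	printf("io_pages (from KiB)     = %u\n", max_sectors_kb >> (page_shift - 10));
	return 0;
}
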
+ 19 - 0
drivers/of/fdt.c

@@ -1015,6 +1015,7 @@ int __init early_init_dt_scan_memory(unsigned long node, const char *uname,
 	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
 	const __be32 *reg, *endp;
 	int l;
+	bool hotpluggable;
 
 	/* We are scanning "memory" nodes only */
 	if (type == NULL) {
@@ -1034,6 +1035,7 @@ int __init early_init_dt_scan_memory(unsigned long node, const char *uname,
 		return 0;
 
 	endp = reg + (l / sizeof(__be32));
+	hotpluggable = of_get_flat_dt_prop(node, "hotpluggable", NULL);
 
 	pr_debug("memory scan node %s, reg size %d,\n", uname, l);
 
@@ -1049,6 +1051,13 @@ int __init early_init_dt_scan_memory(unsigned long node, const char *uname,
 		    (unsigned long long)size);
 
 		early_init_dt_add_memory_arch(base, size);
+
+		if (!hotpluggable)
+			continue;
+
+		if (early_init_dt_mark_hotplug_memory_arch(base, size))
+			pr_warn("failed to mark hotplug range 0x%llx - 0x%llx\n",
+				base, base + size);
 	}
 
 	return 0;
@@ -1146,6 +1155,11 @@ void __init __weak early_init_dt_add_memory_arch(u64 base, u64 size)
 	memblock_add(base, size);
 }
 
+int __init __weak early_init_dt_mark_hotplug_memory_arch(u64 base, u64 size)
+{
+	return memblock_mark_hotplug(base, size);
+}
+
 int __init __weak early_init_dt_reserve_memory_arch(phys_addr_t base,
 					phys_addr_t size, bool nomap)
 {
@@ -1168,6 +1182,11 @@ void __init __weak early_init_dt_add_memory_arch(u64 base, u64 size)
 	WARN_ON(1);
 }
 
+int __init __weak early_init_dt_mark_hotplug_memory_arch(u64 base, u64 size)
+{
+	return -ENOSYS;
+}
+
 int __init __weak early_init_dt_reserve_memory_arch(phys_addr_t base,
 					phys_addr_t size, bool nomap)
 {

+ 28 - 13
drivers/pcmcia/m32r_pcc.c

@@ -296,10 +296,11 @@ static int __init is_alive(u_short sock)
 	return 0;
 	return 0;
 }
 
-static void add_pcc_socket(ulong base, int irq, ulong mapaddr,
-			   unsigned int ioaddr)
+static int add_pcc_socket(ulong base, int irq, ulong mapaddr,
+			  unsigned int ioaddr)
 {
   	pcc_socket_t *t = &socket[pcc_sockets];
+	int err;
 
 	/* add sockets */
 	t->ioaddr = ioaddr;
@@ -328,11 +329,16 @@ static void add_pcc_socket(ulong base, int irq, ulong mapaddr,
 	t->socket.irq_mask = 0;
 	t->socket.pci_irq = 2 + pcc_sockets; /* XXX */
 
-	request_irq(irq, pcc_interrupt, 0, "m32r-pcc", pcc_interrupt);
+	err = request_irq(irq, pcc_interrupt, 0, "m32r-pcc", pcc_interrupt);
+	if (err) {
+		if (t->base > 0)
+			release_region(t->base, 0x20);
+		return err;
+	}
 
 	pcc_sockets++;
 
-	return;
+	return 0;
 }
 
 
@@ -683,26 +689,29 @@ static int __init init_m32r_pcc(void)
 		return ret;
 
 	ret = platform_device_register(&pcc_device);
-	if (ret){
-		platform_driver_unregister(&pcc_driver);
-		return ret;
-	}
+	if (ret)
+		goto unreg_driv;
 
 	printk(KERN_INFO "m32r PCC probe:\n");
 
 	pcc_sockets = 0;
 
-	add_pcc_socket(M32R_PCC0_BASE, PCC0_IRQ, M32R_PCC0_MAPBASE, 0x1000);
+	ret = add_pcc_socket(M32R_PCC0_BASE, PCC0_IRQ, M32R_PCC0_MAPBASE,
+			     0x1000);
+	if (ret)
+		goto unreg_dev;
 
 #ifdef CONFIG_M32RPCC_SLOT2
-	add_pcc_socket(M32R_PCC1_BASE, PCC1_IRQ, M32R_PCC1_MAPBASE, 0x2000);
+	ret = add_pcc_socket(M32R_PCC1_BASE, PCC1_IRQ, M32R_PCC1_MAPBASE,
+			     0x2000);
+	if (ret)
+		goto unreg_dev;
#endif
 
 	if (pcc_sockets == 0) {
 		printk("socket is not found.\n");
-		platform_device_unregister(&pcc_device);
-		platform_driver_unregister(&pcc_driver);
-		return -ENODEV;
+		ret = -ENODEV;
+		goto unreg_dev;
 	}
 
 	/* Set up interrupt handler(s) */
@@ -728,6 +737,12 @@ static int __init init_m32r_pcc(void)
 	}
 
 	return 0;
+
+unreg_dev:
+	platform_device_unregister(&pcc_device);
+unreg_driv:
+	platform_driver_unregister(&pcc_driver);
+	return ret;
 } /* init_m32r_pcc */
 
 static void __exit exit_m32r_pcc(void)

+ 1 - 1
drivers/sh/intc/virq.c

@@ -254,7 +254,7 @@ restart:
 
 		radix_tree_tag_clear(&d->tree, entry->enum_id,
 				     INTC_TAG_VIRQ_NEEDS_ALLOC);
-		radix_tree_replace_slot((void **)entries[i],
+		radix_tree_replace_slot(&d->tree, (void **)entries[i],
 					&intc_irq_xlate[irq]);
 	}
 

+ 4 - 2
fs/binfmt_elf.c

@@ -2204,7 +2204,9 @@ static int elf_core_dump(struct coredump_params *cprm)
 
 	dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
 
-	vma_filesz = kmalloc_array(segs - 1, sizeof(*vma_filesz), GFP_KERNEL);
+	if (segs - 1 > ULONG_MAX / sizeof(*vma_filesz))
+		goto end_coredump;
+	vma_filesz = vmalloc((segs - 1) * sizeof(*vma_filesz));
 	if (!vma_filesz)
 		goto end_coredump;
 
@@ -2311,7 +2313,7 @@ end_coredump:
 cleanup:
 	free_note_info(&info);
 	kfree(shdr4extnum);
-	kfree(vma_filesz);
+	vfree(vma_filesz);
 	kfree(phdr4note);
 	kfree(elf);
 out:

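The added guard rejects any segment count whose byte size would overflow before it reaches vmalloc(); dividing the maximum value by the element size gives the largest safe count. The same pattern in a stand-alone form (illustration only, using SIZE_MAX rather than the kernel's ULONG_MAX):

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* True if nmemb * size would overflow size_t. */
static int alloc_would_overflow(size_t nmemb, size_t size)
{
	return size && nmemb > SIZE_MAX / size;
}

int main(void)
{
	printf("%d\n", alloc_would_overflow(1000, sizeof(uint64_t)));          /* 0 */
	printf("%d\n", alloc_would_overflow(SIZE_MAX / 4, sizeof(uint64_t)));  /* 1 */
	return 0;
}
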
+ 15 - 11
fs/btrfs/super.c

@@ -202,27 +202,31 @@ static struct ratelimit_state printk_limits[] = {
 void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
 void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
 {
 	struct super_block *sb = fs_info->sb;
-	char lvl[4];
+	char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1];
 	struct va_format vaf;
 	va_list args;
-	const char *type = logtypes[4];
+	const char *type = NULL;
 	int kern_level;
 	struct ratelimit_state *ratelimit;
 
 	va_start(args, fmt);
 
-	kern_level = printk_get_level(fmt);
-	if (kern_level) {
+	while ((kern_level = printk_get_level(fmt)) != 0) {
 		size_t size = printk_skip_level(fmt) - fmt;
-		memcpy(lvl, fmt,  size);
-		lvl[size] = '\0';
+
+		if (kern_level >= '0' && kern_level <= '7') {
+			memcpy(lvl, fmt,  size);
+			lvl[size] = '\0';
+			type = logtypes[kern_level - '0'];
+			ratelimit = &printk_limits[kern_level - '0'];
+		}
 		fmt += size;
-		type = logtypes[kern_level - '0'];
-		ratelimit = &printk_limits[kern_level - '0'];
-	} else {
+	}
+
+	if (!type) {
 		*lvl = '\0';
-		/* Default to debug output */
-		ratelimit = &printk_limits[7];
+		type = logtypes[4];
+		ratelimit = &printk_limits[4];
 	}
 
 	vaf.fmt = fmt;
+ 6 - 4
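The rewritten loop relies on each printk header being the two bytes KERN_SOH ('\001') plus a level character, so several headers can be stacked in one format string. A stand-alone sketch of the same stripping logic, with a hypothetical get_level() helper standing in for printk_get_level()/printk_skip_level() (illustration only):

#include <stdio.h>

/* Hypothetical helper: return the level character of a "\001<level>" header,
 * or 0 if the string does not start with one. */
static char get_level(const char *fmt)
{
	return (fmt[0] == '\001' && fmt[1]) ? fmt[1] : 0;
}

int main(void)
{
	const char *fmt = "\0016" "\001c" "hello";	/* KERN_INFO then KERN_CONT */
	char level = 0, c;

	while ((c = get_level(fmt)) != 0) {
		if (c >= '0' && c <= '7')
			level = c;	/* remember the last real loglevel */
		fmt += 2;		/* skip the two-byte header */
	}
	printf("level=%c text=\"%s\"\n", level ? level : '?', fmt);
	return 0;
}
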
fs/dax.c

@@ -342,7 +342,7 @@ static inline void *lock_slot(struct address_space *mapping, void **slot)
 		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
 
 	entry |= RADIX_DAX_ENTRY_LOCK;
-	radix_tree_replace_slot(slot, (void *)entry);
+	radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
 	return (void *)entry;
 }
 
@@ -356,7 +356,7 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot)
 		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
 
 	entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
-	radix_tree_replace_slot(slot, (void *)entry);
+	radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
 	return (void *)entry;
 }
 
@@ -643,12 +643,14 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
 		}
 		mapping->nrexceptional++;
 	} else {
+		struct radix_tree_node *node;
 		void **slot;
 		void *ret;
 
-		ret = __radix_tree_lookup(page_tree, index, NULL, &slot);
+		ret = __radix_tree_lookup(page_tree, index, &node, &slot);
 		WARN_ON_ONCE(ret != entry);
-		radix_tree_replace_slot(slot, new_entry);
+		__radix_tree_replace(page_tree, node, slot,
+				     new_entry, NULL, NULL);
 	}
 	if (vmf->flags & FAULT_FLAG_WRITE)
 		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);

+ 7 - 9
fs/fs-writeback.c

@@ -1769,15 +1769,13 @@ static long wb_writeback(struct bdi_writeback *wb,
 		 * become available for writeback. Otherwise
 		 * we'll just busyloop.
 		 */
-		if (!list_empty(&wb->b_more_io))  {
-			trace_writeback_wait(wb, work);
-			inode = wb_inode(wb->b_more_io.prev);
-			spin_lock(&inode->i_lock);
-			spin_unlock(&wb->list_lock);
-			/* This function drops i_lock... */
-			inode_sleep_on_writeback(inode);
-			spin_lock(&wb->list_lock);
-		}
+		trace_writeback_wait(wb, work);
+		inode = wb_inode(wb->b_more_io.prev);
+		spin_lock(&inode->i_lock);
+		spin_unlock(&wb->list_lock);
+		/* This function drops i_lock... */
+		inode_sleep_on_writeback(inode);
+		spin_lock(&wb->list_lock);
 	}
 	spin_unlock(&wb->list_lock);
 	blk_finish_plug(&plug);

+ 3 - 4
fs/ocfs2/aops.c

@@ -1950,8 +1950,7 @@ static void ocfs2_write_end_inline(struct inode *inode, loff_t pos,
 }
 
 int ocfs2_write_end_nolock(struct address_space *mapping,
-			   loff_t pos, unsigned len, unsigned copied,
-			   struct page *page, void *fsdata)
+			   loff_t pos, unsigned len, unsigned copied, void *fsdata)
 {
 	int i, ret;
 	unsigned from, to, start = pos & (PAGE_SIZE - 1);
@@ -2064,7 +2063,7 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
 	int ret;
 	struct inode *inode = mapping->host;
 
-	ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
+	ret = ocfs2_write_end_nolock(mapping, pos, len, copied, fsdata);
 
 	up_write(&OCFS2_I(inode)->ip_alloc_sem);
 	ocfs2_inode_unlock(inode, 1);
@@ -2241,7 +2240,7 @@ static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock,
 		dwc->dw_zero_count++;
 	}
 
-	ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, NULL, wc);
+	ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc);
 	BUG_ON(ret != len);
 	ret = 0;
 unlock:

+ 1 - 2
fs/ocfs2/aops.h

@@ -44,8 +44,7 @@ int walk_page_buffers(	handle_t *handle,
 					struct buffer_head *bh));
 
 int ocfs2_write_end_nolock(struct address_space *mapping,
-			   loff_t pos, unsigned len, unsigned copied,
-			   struct page *page, void *fsdata);
+			   loff_t pos, unsigned len, unsigned copied, void *fsdata);
 
 typedef enum {
 	OCFS2_WRITE_BUFFER = 0,

+ 1 - 1
fs/ocfs2/cluster/heartbeat.c

@@ -741,7 +741,7 @@ static inline void o2hb_prepare_block(struct o2hb_region *reg,
 	hb_block = (struct o2hb_disk_heartbeat_block *)slot->ds_raw_block;
 	memset(hb_block, 0, reg->hr_block_bytes);
 	/* TODO: time stuff */
-	cputime = CURRENT_TIME.tv_sec;
+	cputime = ktime_get_real_seconds();
 	if (!cputime)
 		cputime = 1;
 

+ 1 - 10
fs/ocfs2/dlm/dlmmaster.c

@@ -1609,8 +1609,6 @@ way_up_top:
 		__dlm_insert_mle(dlm, mle);
 		response = DLM_MASTER_RESP_NO;
 	} else {
-		// mlog(0, "mle was found\n");
-		set_maybe = 1;
 		spin_lock(&tmpmle->spinlock);
 		if (tmpmle->master == dlm->node_num) {
 			mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n");
@@ -1625,8 +1623,7 @@ way_up_top:
 			response = DLM_MASTER_RESP_NO;
 		} else
 			response = DLM_MASTER_RESP_MAYBE;
-		if (set_maybe)
-			set_bit(request->node_idx, tmpmle->maybe_map);
+		set_bit(request->node_idx, tmpmle->maybe_map);
 		spin_unlock(&tmpmle->spinlock);
 	}
 	spin_unlock(&dlm->master_lock);
@@ -1644,12 +1641,6 @@ send_response:
 	 * dlm_assert_master_worker() isn't called, we drop it here.
 	 */
 	if (dispatch_assert) {
-		if (response != DLM_MASTER_RESP_YES)
-			mlog(ML_ERROR, "invalid response %d\n", response);
-		if (!res) {
-			mlog(ML_ERROR, "bad lockres while trying to assert!\n");
-			BUG();
-		}
 		mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
 			     dlm->node_num, res->lockname.len, res->lockname.name);
 		spin_lock(&res->spinlock);

+ 0 - 2
fs/ocfs2/dlm/dlmrecovery.c

@@ -2966,8 +2966,6 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
 			spin_unlock(&dlm->spinlock);
 			dlm_kick_recovery_thread(dlm);
 			break;
-		default:
-			BUG();
 	}
 
 	mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n",

+ 1 - 1
fs/ocfs2/inode.c

@@ -703,7 +703,7 @@ static int ocfs2_remove_inode(struct inode *inode,
 		goto bail_commit;
 	}
 
-	di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
+	di->i_dtime = cpu_to_le64(ktime_get_real_seconds());
 	di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
 	ocfs2_journal_dirty(handle, di_bh);
 

+ 2 - 2
fs/ocfs2/journal.c

@@ -1947,7 +1947,7 @@ static void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
 	 */
 	seqno++;
 	os->os_count++;
-	os->os_scantime = CURRENT_TIME;
+	os->os_scantime = ktime_get_seconds();
 unlock:
 	ocfs2_orphan_scan_unlock(osb, seqno);
 out:
@@ -2004,7 +2004,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
 	struct ocfs2_orphan_scan *os;
 
 	os = &osb->osb_orphan_scan;
-	os->os_scantime = CURRENT_TIME;
+	os->os_scantime = ktime_get_seconds();
 	if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb))
 		atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
 	else {

+ 1 - 2
fs/ocfs2/mmap.c

@@ -120,8 +120,7 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
 		ret = VM_FAULT_NOPAGE;
 		goto out;
 	}
-	ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
-				     fsdata);
+	ret = ocfs2_write_end_nolock(mapping, pos, len, len, fsdata);
 	BUG_ON(ret != len);
 	ret = VM_FAULT_LOCKED;
 out:

+ 4 - 2
fs/ocfs2/namei.c

@@ -516,6 +516,7 @@ static int __ocfs2_mknod_locked(struct inode *dir,
 	struct ocfs2_extent_list *fel;
 	u16 feat;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct timespec64 ts;
 
 	*new_fe_bh = NULL;
 
@@ -564,10 +565,11 @@ static int __ocfs2_mknod_locked(struct inode *dir,
 	fe->i_last_eb_blk = 0;
 	strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
 	fe->i_flags |= cpu_to_le32(OCFS2_VALID_FL);
+	ktime_get_real_ts64(&ts);
 	fe->i_atime = fe->i_ctime = fe->i_mtime =
-		cpu_to_le64(CURRENT_TIME.tv_sec);
+		cpu_to_le64(ts.tv_sec);
 	fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec =
-		cpu_to_le32(CURRENT_TIME.tv_nsec);
+		cpu_to_le32(ts.tv_nsec);
 	fe->i_dtime = 0;
 
 	/*

+ 1 - 1
fs/ocfs2/ocfs2.h

@@ -224,7 +224,7 @@ struct ocfs2_orphan_scan {
 	struct ocfs2_super 	*os_osb;
 	struct ocfs2_lock_res 	os_lockres;     /* lock to synchronize scans */
 	struct delayed_work 	os_orphan_scan_work;
-	struct timespec		os_scantime;  /* time this node ran the scan */
+	time64_t		os_scantime;  /* time this node ran the scan */
 	u32			os_count;      /* tracks node specific scans */
 	u32  			os_seqno;       /* tracks cluster wide scans */
 	atomic_t		os_state;              /* ACTIVE or INACTIVE */

+ 0 - 1
fs/ocfs2/refcounttree.c

@@ -478,7 +478,6 @@ again:
 	if (ret) {
 		mlog_errno(ret);
 		ocfs2_unlock_refcount_tree(osb, tree, rw);
-		ocfs2_refcount_tree_put(tree);
 		goto out;
 	}
 

+ 1 - 1
fs/ocfs2/super.c

@@ -337,7 +337,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
 		out += snprintf(buf + out, len - out, "Disabled\n");
 	else
 		out += snprintf(buf + out, len - out, "%lu seconds ago\n",
-				(get_seconds() - os->os_scantime.tv_sec));
+				(unsigned long)(ktime_get_seconds() - os->os_scantime));
 
 	out += snprintf(buf + out, len - out, "%10s => %3s  %10s\n",
 			"Slots", "Num", "RecoGen");

+ 4 - 3
fs/proc/array.c

@@ -245,7 +245,7 @@ void render_sigset_t(struct seq_file *m, const char *header,
 		if (sigismember(set, i+2)) x |= 2;
 		if (sigismember(set, i+3)) x |= 4;
 		if (sigismember(set, i+4)) x |= 8;
-		seq_printf(m, "%x", x);
+		seq_putc(m, hex_asc[x]);
 	} while (i >= 4);
 
 	seq_putc(m, '\n');
@@ -342,10 +342,11 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p)
 
 static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
 {
+	seq_put_decimal_ull(m, "NoNewPrivs:\t", task_no_new_privs(p));
 #ifdef CONFIG_SECCOMP
-	seq_put_decimal_ull(m, "Seccomp:\t", p->seccomp.mode);
-	seq_putc(m, '\n');
+	seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode);
 #endif
+	seq_putc(m, '\n');
 }
 
 static inline void task_context_switch_counts(struct seq_file *m,

+ 19 - 12
fs/proc/base.c

@@ -104,9 +104,12 @@
  *	in /proc for a task before it execs a suid executable.
  */
 
+static u8 nlink_tid;
+static u8 nlink_tgid;
+
 struct pid_entry {
 	const char *name;
-	int len;
+	unsigned int len;
 	umode_t mode;
 	const struct inode_operations *iop;
 	const struct file_operations *fop;
@@ -139,13 +142,13 @@ struct pid_entry {
  * Count the number of hardlinks for the pid_entry table, excluding the .
  * and .. links.
  */
-static unsigned int pid_entry_count_dirs(const struct pid_entry *entries,
+static unsigned int __init pid_entry_nlink(const struct pid_entry *entries,
 	unsigned int n)
 {
 	unsigned int i;
 	unsigned int count;
 
-	count = 0;
+	count = 2;
 	for (i = 0; i < n; ++i) {
 		if (S_ISDIR(entries[i].mode))
 			++count;
@@ -1967,7 +1970,7 @@ out:
 
 struct map_files_info {
 	fmode_t		mode;
-	unsigned long	len;
+	unsigned int	len;
 	unsigned char	name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
 };
 
@@ -2412,14 +2415,14 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
 	 * Yes, it does not scale. And it should not. Don't add
 	 * new entries into /proc/<tgid>/ without very good reasons.
 	 */
-	last = &ents[nents - 1];
-	for (p = ents; p <= last; p++) {
+	last = &ents[nents];
+	for (p = ents; p < last; p++) {
 		if (p->len != dentry->d_name.len)
 			continue;
 		if (!memcmp(dentry->d_name.name, p->name, p->len))
 			break;
 	}
-	if (p > last)
+	if (p >= last)
 		goto out;
 
 	error = proc_pident_instantiate(dir, dentry, task, p);
@@ -2444,7 +2447,7 @@ static int proc_pident_readdir(struct file *file, struct dir_context *ctx,
 	if (ctx->pos >= nents + 2)
 		goto out;
 
-	for (p = ents + (ctx->pos - 2); p <= ents + nents - 1; p++) {
+	for (p = ents + (ctx->pos - 2); p < ents + nents; p++) {
 		if (!proc_fill_cache(file, ctx, p->name, p->len,
 				proc_pident_instantiate, task, p))
 			break;
@@ -3068,8 +3071,7 @@ static int proc_pid_instantiate(struct inode *dir,
 	inode->i_fop = &proc_tgid_base_operations;
 	inode->i_flags|=S_IMMUTABLE;
 
-	set_nlink(inode, 2 + pid_entry_count_dirs(tgid_base_stuff,
-						  ARRAY_SIZE(tgid_base_stuff)));
+	set_nlink(inode, nlink_tgid);
 
 	d_set_d_op(dentry, &pid_dentry_operations);
 
@@ -3361,8 +3363,7 @@ static int proc_task_instantiate(struct inode *dir,
 	inode->i_fop = &proc_tid_base_operations;
 	inode->i_flags|=S_IMMUTABLE;
 
-	set_nlink(inode, 2 + pid_entry_count_dirs(tid_base_stuff,
-						  ARRAY_SIZE(tid_base_stuff)));
+	set_nlink(inode, nlink_tid);
 
 	d_set_d_op(dentry, &pid_dentry_operations);
 
@@ -3552,3 +3553,9 @@ static const struct file_operations proc_task_operations = {
 	.iterate_shared	= proc_task_readdir,
 	.llseek		= generic_file_llseek,
 };
+
+void __init set_proc_pid_nlink(void)
+{
+	nlink_tid = pid_entry_nlink(tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
+	nlink_tgid = pid_entry_nlink(tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
+}

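Precomputing nlink_tid/nlink_tgid works because a directory's link count is 2 (for "." and "..") plus one per subdirectory, which is exactly what pid_entry_nlink() now counts starting from 2. A tiny stand-alone check of that arithmetic with a made-up table (illustration only):

#include <stdio.h>
#include <stdbool.h>

struct entry { const char *name; bool is_dir; };

/* Same counting rule as pid_entry_nlink(): start at 2, add one per directory. */
static unsigned int nlink(const struct entry *e, unsigned int n)
{
	unsigned int i, count = 2;

	for (i = 0; i < n; i++)
		if (e[i].is_dir)
			count++;
	return count;
}

int main(void)
{
	const struct entry table[] = {
		{ "status", false }, { "fd", true }, { "net", true },
	};

	printf("nlink = %u\n", nlink(table, 3));	/* prints 4 */
	return 0;
}
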
+ 26 - 11
fs/proc/inode.c

@@ -138,6 +138,16 @@ static void unuse_pde(struct proc_dir_entry *pde)
 /* pde is locked */
 /* pde is locked */
 static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
 static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
 {
 {
+	/*
+	 * close() (proc_reg_release()) can't delete an entry and proceed:
+	 * ->release hook needs to be available at the right moment.
+	 *
+	 * rmmod (remove_proc_entry() et al) can't delete an entry and proceed:
+	 * "struct file" needs to be available at the right moment.
+	 *
+	 * Therefore, first process to enter this function does ->release() and
+	 * signals its completion to the other process which does nothing.
+	 */
 	if (pdeo->closing) {
 	if (pdeo->closing) {
 		/* somebody else is doing that, just wait */
 		/* somebody else is doing that, just wait */
 		DECLARE_COMPLETION_ONSTACK(c);
 		DECLARE_COMPLETION_ONSTACK(c);
@@ -147,12 +157,13 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
 		spin_lock(&pde->pde_unload_lock);
 		spin_lock(&pde->pde_unload_lock);
 	} else {
 	} else {
 		struct file *file;
 		struct file *file;
-		pdeo->closing = 1;
+		pdeo->closing = true;
 		spin_unlock(&pde->pde_unload_lock);
 		spin_unlock(&pde->pde_unload_lock);
 		file = pdeo->file;
 		file = pdeo->file;
 		pde->proc_fops->release(file_inode(file), file);
 		pde->proc_fops->release(file_inode(file), file);
 		spin_lock(&pde->pde_unload_lock);
 		spin_lock(&pde->pde_unload_lock);
-		list_del_init(&pdeo->lh);
+		/* After ->release. */
+		list_del(&pdeo->lh);
 		if (pdeo->c)
 		if (pdeo->c)
 			complete(pdeo->c);
 			complete(pdeo->c);
 		kfree(pdeo);
 		kfree(pdeo);
@@ -167,6 +178,8 @@ void proc_entry_rundown(struct proc_dir_entry *de)
 	if (atomic_add_return(BIAS, &de->in_use) != BIAS)
 	if (atomic_add_return(BIAS, &de->in_use) != BIAS)
 		wait_for_completion(&c);
 		wait_for_completion(&c);
 
 
+	/* ->pde_openers list can't grow from now on. */
+
 	spin_lock(&de->pde_unload_lock);
 	while (!list_empty(&de->pde_openers)) {
 		struct pde_opener *pdeo;
@@ -312,16 +325,17 @@ static int proc_reg_open(struct inode *inode, struct file *file)
 	struct pde_opener *pdeo;
 
 	/*
-	 * What for, you ask? Well, we can have open, rmmod, remove_proc_entry
-	 * sequence. ->release won't be called because ->proc_fops will be
-	 * cleared. Depending on complexity of ->release, consequences vary.
+	 * Ensure that
+	 * 1) PDE's ->release hook will be called no matter what
+	 *    either normally by close()/->release, or forcefully by
+	 *    rmmod/remove_proc_entry.
+	 *
+	 * 2) rmmod isn't blocked by opening file in /proc and sitting on
+	 *    the descriptor (including "rmmod foo </proc/foo" scenario).
 	 *
-	 * We can't wait for mercy when close will be done for real, it's
-	 * deadlockable: rmmod foo </proc/foo . So, we're going to do ->release
-	 * by hand in remove_proc_entry(). For this, save opener's credentials
-	 * for later.
+	 * Save every "struct file" with custom ->release hook.
 	 */
-	pdeo = kzalloc(sizeof(struct pde_opener), GFP_KERNEL);
+	pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL);
 	if (!pdeo)
 		return -ENOMEM;
 
@@ -338,7 +352,8 @@ static int proc_reg_open(struct inode *inode, struct file *file)
 	if (rv == 0 && release) {
 		/* To know what to release. */
 		pdeo->file = file;
-		/* Strictly for "too late" ->release in proc_reg_release(). */
+		pdeo->closing = false;
+		pdeo->c = NULL;
 		spin_lock(&pde->pde_unload_lock);
 		list_add(&pdeo->lh, &pde->pde_openers);
 		spin_unlock(&pde->pde_unload_lock);

+ 2 - 1
fs/proc/internal.h

@@ -203,7 +203,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name);
 struct pde_opener {
 	struct file *file;
 	struct list_head lh;
-	int closing;
+	bool closing;
 	struct completion *c;
 };
 extern const struct inode_operations proc_link_inode_operations;
@@ -211,6 +211,7 @@ extern const struct inode_operations proc_link_inode_operations;
 extern const struct inode_operations proc_pid_link_inode_operations;
 
 extern void proc_init_inodecache(void);
+void set_proc_pid_nlink(void);
 extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
 extern int proc_fill_super(struct super_block *, void *data, int flags);
 extern void proc_entry_rundown(struct proc_dir_entry *);

+ 1 - 0
fs/proc/root.c

@@ -122,6 +122,7 @@ void __init proc_root_init(void)
 	int err;
 
 	proc_init_inodecache();
+	set_proc_pid_nlink();
 	err = register_filesystem(&proc_fs_type);
 	if (err)
 		return;

+ 1 - 0
fs/proc/task_mmu.c

@@ -1588,6 +1588,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap_unlock(orig_pte, ptl);
+	cond_resched();
 	return 0;
 }
 #ifdef CONFIG_HUGETLB_PAGE

+ 2 - 11
include/asm-generic/pgtable.h

@@ -652,18 +652,9 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
 }
 #endif
 
-#ifndef pmd_move_must_withdraw
-static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
-					 spinlock_t *old_pmd_ptl)
-{
-	/*
-	 * With split pmd lock we also need to move preallocated
-	 * PTE page table if new_pmd is on different PMD page table.
-	 */
-	return new_pmd_ptl != old_pmd_ptl;
-}
+#ifndef arch_needs_pgtable_deposit
+#define arch_needs_pgtable_deposit() (false)
 #endif
-
 /*
  * This function is meant to be used by sites walking pagetables with
  * the mmap_sem hold in read mode to protect against MADV_DONTNEED and

+ 50 - 33
include/asm-generic/tlb.h

@@ -107,11 +107,6 @@ struct mmu_gather {
 	struct mmu_gather_batch	local;
 	struct page		*__pages[MMU_GATHER_BUNDLE];
 	unsigned int		batch_count;
-	/*
-	 * __tlb_adjust_range  will track the new addr here,
-	 * that that we can adjust the range after the flush
-	 */
-	unsigned long addr;
 	int page_size;
 };
 
@@ -125,16 +120,11 @@ extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
 				   int page_size);
 
 static inline void __tlb_adjust_range(struct mmu_gather *tlb,
-				      unsigned long address)
+				      unsigned long address,
+				      unsigned int range_size)
 {
 	tlb->start = min(tlb->start, address);
-	tlb->end = max(tlb->end, address + PAGE_SIZE);
-	/*
-	 * Track the last address with which we adjusted the range. This
-	 * will be used later to adjust again after a mmu_flush due to
-	 * failed __tlb_remove_page
-	 */
-	tlb->addr = address;
+	tlb->end = max(tlb->end, address + range_size);
 }
 
 static inline void __tlb_reset_range(struct mmu_gather *tlb)
@@ -150,15 +140,11 @@ static inline void __tlb_reset_range(struct mmu_gather *tlb)
 static inline void tlb_remove_page_size(struct mmu_gather *tlb,
 					struct page *page, int page_size)
 {
-	if (__tlb_remove_page_size(tlb, page, page_size)) {
+	if (__tlb_remove_page_size(tlb, page, page_size))
 		tlb_flush_mmu(tlb);
-		tlb->page_size = page_size;
-		__tlb_adjust_range(tlb, tlb->addr);
-		__tlb_remove_page_size(tlb, page, page_size);
-	}
 }
 
-static bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
 {
 	return __tlb_remove_page_size(tlb, page, PAGE_SIZE);
 }
@@ -172,14 +158,21 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
 	return tlb_remove_page_size(tlb, page, PAGE_SIZE);
 }
 
-static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, struct page *page)
+#ifndef tlb_remove_check_page_size_change
+#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
+static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
+						     unsigned int page_size)
 {
-	/* active->nr should be zero when we call this */
-	VM_BUG_ON_PAGE(tlb->active->nr, page);
-	tlb->page_size = PAGE_SIZE;
-	__tlb_adjust_range(tlb, tlb->addr);
-	return __tlb_remove_page(tlb, page);
+	/*
+	 * We don't care about page size change, just update
+	 * mmu_gather page size here so that debug checks
+	 * don't throw false warnings.
+	 */
+#ifdef CONFIG_DEBUG_VM
+	tlb->page_size = page_size;
+#endif
 }
+#endif
 
 /*
  * In the case of tlb vma handling, we can optimise these away in the
@@ -215,10 +208,16 @@ static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, struct page *pa
  */
 #define tlb_remove_tlb_entry(tlb, ptep, address)		\
 	do {							\
-		__tlb_adjust_range(tlb, address);		\
+		__tlb_adjust_range(tlb, address, PAGE_SIZE);	\
 		__tlb_remove_tlb_entry(tlb, ptep, address);	\
 	} while (0)
 
+#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)	     \
+	do {							     \
+		__tlb_adjust_range(tlb, address, huge_page_size(h)); \
+		__tlb_remove_tlb_entry(tlb, ptep, address);	     \
+	} while (0)
+
 /**
  * tlb_remove_pmd_tlb_entry - remember a pmd mapping for later tlb invalidation
  * This is a nop so far, because only x86 needs it.
@@ -227,29 +226,47 @@ static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, struct page *pa
 #define __tlb_remove_pmd_tlb_entry(tlb, pmdp, address) do {} while (0)
 #endif
 
-#define tlb_remove_pmd_tlb_entry(tlb, pmdp, address)		\
-	do {							\
-		__tlb_adjust_range(tlb, address);		\
-		__tlb_remove_pmd_tlb_entry(tlb, pmdp, address);	\
+#define tlb_remove_pmd_tlb_entry(tlb, pmdp, address)			\
+	do {								\
+		__tlb_adjust_range(tlb, address, HPAGE_PMD_SIZE);	\
+		__tlb_remove_pmd_tlb_entry(tlb, pmdp, address);		\
 	} while (0)
 
+/*
+ * For things like page tables caches (ie caching addresses "inside" the
+ * page tables, like x86 does), for legacy reasons, flushing an
+ * individual page had better flush the page table caches behind it. This
+ * is definitely how x86 works, for example. And if you have an
+ * architected non-legacy page table cache (which I'm not aware of
+ * anybody actually doing), you're going to have some architecturally
+ * explicit flushing for that, likely *separate* from a regular TLB entry
+ * flush, and thus you'd need more than just some range expansion..
+ *
+ * So if we ever find an architecture
+ * that would want something that odd, I think it is up to that
+ * architecture to do its own odd thing, not cause pain for others
+ * http://lkml.kernel.org/r/CA+55aFzBggoXtNXQeng5d_mRoDnaMBE5Y+URs+PHR67nUpMtaw@mail.gmail.com
+ *
+ * For now w.r.t page table cache, mark the range_size as PAGE_SIZE
+ */
+
 #define pte_free_tlb(tlb, ptep, address)			\
 	do {							\
-		__tlb_adjust_range(tlb, address);		\
+		__tlb_adjust_range(tlb, address, PAGE_SIZE);	\
 		__pte_free_tlb(tlb, ptep, address);		\
 	} while (0)
 
 #ifndef __ARCH_HAS_4LEVEL_HACK
 #define pud_free_tlb(tlb, pudp, address)			\
 	do {							\
-		__tlb_adjust_range(tlb, address);		\
+		__tlb_adjust_range(tlb, address, PAGE_SIZE);	\
 		__pud_free_tlb(tlb, pudp, address);		\
 	} while (0)
 #endif
 
 #define pmd_free_tlb(tlb, pmdp, address)			\
 	do {							\
-		__tlb_adjust_range(tlb, address);		\
+		__tlb_adjust_range(tlb, address, PAGE_SIZE);	\
 		__pmd_free_tlb(tlb, pmdp, address);		\
 	} while (0)
 

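The reworked __tlb_adjust_range() above takes the mapping size explicitly, so huge mappings grow the pending flush range by their real size instead of by PAGE_SIZE. A short illustrative sketch, not a real call site (the tlb pointer is assumed to come from tlb_gather_mmu(), and HPAGE_PMD_SIZE assumes a THP-enabled build):

static void example_adjust(struct mmu_gather *tlb, unsigned long addr)
{
	/* A normal PTE extends the gathered range by one base page... */
	__tlb_adjust_range(tlb, addr, PAGE_SIZE);

	/* ...while tlb_remove_pmd_tlb_entry() and the new
	 * tlb_remove_huge_tlb_entry() pass the huge mapping size instead.
	 */
	__tlb_adjust_range(tlb, addr, HPAGE_PMD_SIZE);
}
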
+ 2 - 1
include/linux/backing-dev-defs.h

@@ -136,12 +136,13 @@ struct bdi_writeback {
 struct backing_dev_info {
 	struct list_head bdi_list;
 	unsigned long ra_pages;	/* max readahead in PAGE_SIZE units */
-	unsigned int capabilities; /* Device capabilities */
+	unsigned long io_pages;	/* max allowed IO size */
 	congested_fn *congested_fn; /* Function pointer if device is md/dm */
 	void *congested_data;	/* Pointer to aux data for congested func */
 
 	char *name;
 
+	unsigned int capabilities; /* Device capabilities */
 	unsigned int min_ratio;
 	unsigned int max_ratio, max_prop_frac;
 

+ 3 - 0
include/linux/cma.h

@@ -1,6 +1,9 @@
 #ifndef __CMA_H__
 #define __CMA_H__
 
+#include <linux/init.h>
+#include <linux/types.h>
+
 /*
  * There is always at least global CMA area and a few optional
  * areas configured in kernel .config.

+ 1 - 1
include/linux/compiler-gcc.h

@@ -21,7 +21,7 @@
  * clobbered. The issue is as follows: while the inline asm might
  * access any memory it wants, the compiler could have fit all of
  * @ptr into memory registers instead, and since @ptr never escaped
- * from that, it proofed that the inline asm wasn't touching any of
+ * from that, it proved that the inline asm wasn't touching any of
  * it. This version works well with both compilers, i.e. we're telling
  * the compiler that the inline asm absolutely may see the contents
  * of @ptr. See also: https://llvm.org/bugs/show_bug.cgi?id=15495

+ 2 - 0
include/linux/huge_mm.h

@@ -189,6 +189,8 @@ static inline void deferred_split_huge_page(struct page *page) {}
 #define split_huge_pmd(__vma, __pmd, __address)	\
 	do { } while (0)
 
+static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+		unsigned long address, bool freeze, struct page *page) {}
 static inline void split_huge_pmd_address(struct vm_area_struct *vma,
 		unsigned long address, bool freeze, struct page *page) {}
 

+ 1 - 1
include/linux/kthread.h

@@ -175,7 +175,7 @@ __printf(2, 3)
 struct kthread_worker *
 kthread_create_worker(unsigned int flags, const char namefmt[], ...);
 
-struct kthread_worker *
+__printf(3, 4) struct kthread_worker *
 kthread_create_worker_on_cpu(int cpu, unsigned int flags,
 			     const char namefmt[], ...);
 

+ 8 - 0
include/linux/mempolicy.h

@@ -7,6 +7,7 @@
 
 
 #include <linux/mmzone.h>
+#include <linux/dax.h>
 #include <linux/slab.h>
 #include <linux/rbtree.h>
 #include <linux/spinlock.h>
@@ -177,6 +178,13 @@ static inline bool vma_migratable(struct vm_area_struct *vma)
 	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
 		return false;
 
+	/*
+	 * DAX device mappings require predictable access latency, so avoid
+	 * incurring periodic faults.
+	 */
+	if (vma_is_dax(vma))
+		return false;
+
 #ifndef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
 	if (vma->vm_flags & VM_HUGETLB)
 		return false;

+ 1 - 0
include/linux/of_fdt.h

@@ -71,6 +71,7 @@ extern int early_init_dt_scan_chosen_stdout(void);
 extern void early_init_fdt_scan_reserved_mem(void);
 extern void early_init_fdt_reserve_self(void);
 extern void early_init_dt_add_memory_arch(u64 base, u64 size);
+extern int early_init_dt_mark_hotplug_memory_arch(u64 base, u64 size);
 extern int early_init_dt_reserve_memory_arch(phys_addr_t base, phys_addr_t size,
 					     bool no_map);
 extern void * early_init_dt_alloc_memory_arch(u64 size, u64 align);

+ 16 - 1
include/linux/printk.h

@@ -10,6 +10,8 @@
 extern const char linux_banner[];
 extern const char linux_proc_banner[];
 
+#define PRINTK_MAX_SINGLE_HEADER_LEN 2
+
 static inline int printk_get_level(const char *buffer)
 {
 	if (buffer[0] == KERN_SOH_ASCII && buffer[1]) {
@@ -31,6 +33,14 @@ static inline const char *printk_skip_level(const char *buffer)
 	return buffer;
 }
 
+static inline const char *printk_skip_headers(const char *buffer)
+{
+	while (printk_get_level(buffer))
+		buffer = printk_skip_level(buffer);
+
+	return buffer;
+}
+
 #define CONSOLE_EXT_LOG_MAX	8192
 
 /* printk's without a loglevel use this.. */
@@ -40,10 +50,15 @@ static inline const char *printk_skip_level(const char *buffer)
 #define CONSOLE_LOGLEVEL_SILENT  0 /* Mum's the word */
 #define CONSOLE_LOGLEVEL_MIN	 1 /* Minimum loglevel we let people use */
 #define CONSOLE_LOGLEVEL_QUIET	 4 /* Shhh ..., when booted with "quiet" */
-#define CONSOLE_LOGLEVEL_DEFAULT 7 /* anything MORE serious than KERN_DEBUG */
 #define CONSOLE_LOGLEVEL_DEBUG	10 /* issue debug messages */
 #define CONSOLE_LOGLEVEL_MOTORMOUTH 15	/* You can't shut this one up */
 
+/*
+ * Default used to be hard-coded at 7, we're now allowing it to be set from
+ * kernel config.
+ */
+#define CONSOLE_LOGLEVEL_DEFAULT CONFIG_CONSOLE_LOGLEVEL_DEFAULT
+
 extern int console_printk[];
 
 #define console_loglevel (console_printk[0])

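The new printk_skip_headers() simply keeps calling printk_skip_level() while printk_get_level() still reports a header, which is what callers whose buffers may carry more than one header (such as the kdb change further down) rely on. A rough sketch of the intended effect, with made-up buffer contents:

char buf[64];
const char *text;

/* Two headers back to back: a loglevel followed by KERN_CONT. */
snprintf(buf, sizeof(buf), "%s%sdisk ready\n", KERN_NOTICE, KERN_CONT);

/*
 * printk_skip_level() would strip only the first header; the new helper
 * strips every header printk_get_level() recognizes, so (assuming the
 * continuation header is among them) text points at "disk ready\n".
 */
text = printk_skip_headers(buf);
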
+ 12 - 22
include/linux/radix-tree.h

@@ -80,14 +80,11 @@ static inline bool radix_tree_is_internal_node(void *ptr)
 #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
 					  RADIX_TREE_MAP_SHIFT))
 
-/* Internally used bits of node->count */
-#define RADIX_TREE_COUNT_SHIFT	(RADIX_TREE_MAP_SHIFT + 1)
-#define RADIX_TREE_COUNT_MASK	((1UL << RADIX_TREE_COUNT_SHIFT) - 1)
-
 struct radix_tree_node {
-	unsigned char	shift;	/* Bits remaining in each slot */
-	unsigned char	offset;	/* Slot offset in parent */
-	unsigned int	count;
+	unsigned char	shift;		/* Bits remaining in each slot */
+	unsigned char	offset;		/* Slot offset in parent */
+	unsigned char	count;		/* Total entry count */
+	unsigned char	exceptional;	/* Exceptional entry count */
 	union {
 		struct {
 			/* Used when ascending tree */
@@ -248,20 +245,6 @@ static inline int radix_tree_exception(void *arg)
 	return unlikely((unsigned long)arg & RADIX_TREE_ENTRY_MASK);
 }
 
-/**
- * radix_tree_replace_slot	- replace item in a slot
- * @pslot:	pointer to slot, returned by radix_tree_lookup_slot
- * @item:	new item to store in the slot.
- *
- * For use with radix_tree_lookup_slot().  Caller must hold tree write locked
- * across slot lookup and replacement.
- */
-static inline void radix_tree_replace_slot(void **pslot, void *item)
-{
-	BUG_ON(radix_tree_is_internal_node(item));
-	rcu_assign_pointer(*pslot, item);
-}
-
 int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
 			unsigned order, struct radix_tree_node **nodep,
 			void ***slotp);
@@ -276,7 +259,14 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
 			  struct radix_tree_node **nodep, void ***slotp);
 void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
 void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
-bool __radix_tree_delete_node(struct radix_tree_root *root,
+typedef void (*radix_tree_update_node_t)(struct radix_tree_node *, void *);
+void __radix_tree_replace(struct radix_tree_root *root,
+			  struct radix_tree_node *node,
+			  void **slot, void *item,
+			  radix_tree_update_node_t update_node, void *private);
+void radix_tree_replace_slot(struct radix_tree_root *root,
+			     void **slot, void *item);
+void __radix_tree_delete_node(struct radix_tree_root *root,
 			      struct radix_tree_node *node);
 void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
 void *radix_tree_delete(struct radix_tree_root *, unsigned long);

+ 9 - 1
include/linux/rmap.h

@@ -137,11 +137,19 @@ static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
  * anon_vma helper functions.
  */
 void anon_vma_init(void);	/* create anon_vma_cachep */
-int  anon_vma_prepare(struct vm_area_struct *);
+int  __anon_vma_prepare(struct vm_area_struct *);
 void unlink_anon_vmas(struct vm_area_struct *);
 int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
 int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);
 
+static inline int anon_vma_prepare(struct vm_area_struct *vma)
+{
+	if (likely(vma->anon_vma))
+		return 0;
+
+	return __anon_vma_prepare(vma);
+}
+
 static inline void anon_vma_merge(struct vm_area_struct *vma,
 				  struct vm_area_struct *next)
 {

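With the split above, the common case (vma->anon_vma already set) stays inline and only the first anonymous fault in a VMA pays for the out-of-line __anon_vma_prepare() call. A hypothetical caller, mirroring the usual fault-path pattern:

static int example_prepare_anon(struct vm_area_struct *vma)
{
	/* Fast path: returns 0 immediately once vma->anon_vma exists. */
	if (unlikely(anon_vma_prepare(vma)))
		return -ENOMEM;		/* allocating the anon_vma failed */

	return 0;
}
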
+ 5 - 1
include/linux/sched.h

@@ -540,7 +540,11 @@ static inline int get_dumpable(struct mm_struct *mm)
 					/* leave room for more dump flags */
 #define MMF_VM_MERGEABLE	16	/* KSM may merge identical pages */
 #define MMF_VM_HUGEPAGE		17	/* set when VM_HUGEPAGE is set on vma */
-#define MMF_EXE_FILE_CHANGED	18	/* see prctl_set_mm_exe_file() */
+/*
+ * This one-shot flag is dropped due to necessity of changing exe once again
+ * on NFS restore
+ */
+//#define MMF_EXE_FILE_CHANGED	18	/* see prctl_set_mm_exe_file() */
 
 #define MMF_HAS_UPROBES		19	/* has uprobes */
 #define MMF_RECALC_UPROBES	20	/* MMF_HAS_UPROBES can be wrong */

+ 1 - 33
include/linux/swap.h

@@ -246,39 +246,7 @@ struct swap_info_struct {
 void *workingset_eviction(struct address_space *mapping, struct page *page);
 bool workingset_refault(void *shadow);
 void workingset_activation(struct page *page);
-extern struct list_lru workingset_shadow_nodes;
-
-static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
-{
-	return node->count & RADIX_TREE_COUNT_MASK;
-}
-
-static inline void workingset_node_pages_inc(struct radix_tree_node *node)
-{
-	node->count++;
-}
-
-static inline void workingset_node_pages_dec(struct radix_tree_node *node)
-{
-	VM_WARN_ON_ONCE(!workingset_node_pages(node));
-	node->count--;
-}
-
-static inline unsigned int workingset_node_shadows(struct radix_tree_node *node)
-{
-	return node->count >> RADIX_TREE_COUNT_SHIFT;
-}
-
-static inline void workingset_node_shadows_inc(struct radix_tree_node *node)
-{
-	node->count += 1U << RADIX_TREE_COUNT_SHIFT;
-}
-
-static inline void workingset_node_shadows_dec(struct radix_tree_node *node)
-{
-	VM_WARN_ON_ONCE(!workingset_node_shadows(node));
-	node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
-}
+void workingset_update_node(struct radix_tree_node *node, void *private);
 
 /* linux/mm/page_alloc.c */
 extern unsigned long totalram_pages;

+ 1 - 0
include/linux/vmalloc.h

@@ -82,6 +82,7 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
 			const void *caller);
 
 extern void vfree(const void *addr);
+extern void vfree_atomic(const void *addr);
 
 extern void *vmap(struct page **pages, unsigned int count,
 			unsigned long flags, pgprot_t prot);

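vfree() may sleep, so the new vfree_atomic() is meant for callers that cannot (kernel/fork.c below switches the thread-stack free over to it). A hypothetical example of the intended use, with an assumed context struct; the actual unmapping is assumed to be deferred internally:

struct example_ctx {
	spinlock_t lock;
	void *buffer;		/* allocated with vmalloc() */
};

static void example_teardown(struct example_ctx *ctx)
{
	spin_lock(&ctx->lock);
	/* Cannot sleep under the spinlock, so use the atomic variant. */
	vfree_atomic(ctx->buffer);
	ctx->buffer = NULL;
	spin_unlock(&ctx->lock);
}
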
+ 1 - 1
init/do_mounts.c

@@ -588,7 +588,7 @@ void __init prepare_namespace(void)
 			saved_root_name);
 		while (driver_probe_done() != 0 ||
 			(ROOT_DEV = name_to_dev_t(saved_root_name)) == 0)
-			msleep(100);
+			msleep(5);
 		async_synchronize_full();
 	}
 

+ 1 - 1
kernel/debug/kdb/kdb_io.c

@@ -697,7 +697,7 @@ kdb_printit:
 	 * Write to all consoles.
 	 */
 	retlen = strlen(kdb_buffer);
-	cp = (char *) printk_skip_level(kdb_buffer);
+	cp = (char *) printk_skip_headers(kdb_buffer);
 	if (!dbg_kdb_mode && kgdb_connected) {
 		gdbstub_msg_write(cp, retlen - (cp - kdb_buffer));
 	} else {

+ 1 - 1
kernel/fork.c

@@ -229,7 +229,7 @@ static inline void free_thread_stack(struct task_struct *tsk)
 		}
 		local_irq_restore(flags);
 
-		vfree(tsk->stack);
+		vfree_atomic(tsk->stack);
 		return;
 	}
 #endif

+ 2 - 1
kernel/hung_task.c

@@ -106,7 +106,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
 	 * complain:
 	 */
 	if (sysctl_hung_task_warnings) {
-		sysctl_hung_task_warnings--;
+		if (sysctl_hung_task_warnings > 0)
+			sysctl_hung_task_warnings--;
 		pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
 			t->comm, t->pid, timeout);
 		pr_err("      %s %s %.*s\n",

+ 3 - 2
kernel/kthread.c

@@ -261,7 +261,8 @@ static void create_kthread(struct kthread_create_info *create)
 	}
 }
 
-static struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
+static __printf(4, 0)
+struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
 						    void *data, int node,
 						    const char namefmt[],
 						    va_list args)
@@ -635,7 +636,7 @@ repeat:
 }
 EXPORT_SYMBOL_GPL(kthread_worker_fn);
 
-static struct kthread_worker *
+static __printf(3, 0) struct kthread_worker *
 __kthread_create_worker(int cpu, unsigned int flags,
 			const char namefmt[], va_list args)
 {

+ 53 - 30
kernel/printk/nmi.c

@@ -67,7 +67,8 @@ static int vprintk_nmi(const char *fmt, va_list args)
 again:
 	len = atomic_read(&s->len);
 
-	if (len >= sizeof(s->buffer)) {
+	/* The trailing '\0' is not counted into len. */
+	if (len >= sizeof(s->buffer) - 1) {
 		atomic_inc(&nmi_message_lost);
 		return 0;
 	}
@@ -79,7 +80,7 @@ again:
 	if (!len)
 		smp_rmb();
 
-	add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args);
+	add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args);
 
 	/*
 	 * Do it once again if the buffer has been flushed in the meantime.
@@ -113,16 +114,51 @@ static void printk_nmi_flush_line(const char *text, int len)
 
 }
 
-/*
- * printk one line from the temporary buffer from @start index until
- * and including the @end index.
- */
-static void printk_nmi_flush_seq_line(struct nmi_seq_buf *s,
-					int start, int end)
+/* printk part of the temporary buffer line by line */
+static int printk_nmi_flush_buffer(const char *start, size_t len)
 {
-	const char *buf = s->buffer + start;
+	const char *c, *end;
+	bool header;
+
+	c = start;
+	end = start + len;
+	header = true;
+
+	/* Print line by line. */
+	while (c < end) {
+		if (*c == '\n') {
+			printk_nmi_flush_line(start, c - start + 1);
+			start = ++c;
+			header = true;
+			continue;
+		}
+
+		/* Handle continuous lines or missing new line. */
+		if ((c + 1 < end) && printk_get_level(c)) {
+			if (header) {
+				c = printk_skip_level(c);
+				continue;
+			}
+
+			printk_nmi_flush_line(start, c - start);
+			start = c++;
+			header = true;
+			continue;
+		}
+
+		header = false;
+		c++;
+	}
 
-	printk_nmi_flush_line(buf, (end - start) + 1);
+	/* Check if there was a partial line. Ignore pure header. */
+	if (start < end && !header) {
+		static const char newline[] = KERN_CONT "\n";
+
+		printk_nmi_flush_line(start, end - start);
+		printk_nmi_flush_line(newline, strlen(newline));
+	}
+
+	return len;
 }
 
 /*
@@ -135,8 +171,8 @@ static void __printk_nmi_flush(struct irq_work *work)
 		__RAW_SPIN_LOCK_INITIALIZER(read_lock);
 	struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work);
 	unsigned long flags;
-	size_t len, size;
-	int i, last_i;
+	size_t len;
+	int i;
 
 	/*
 	 * The lock has two functions. First, one reader has to flush all
@@ -154,12 +190,14 @@ more:
 	/*
 	 * This is just a paranoid check that nobody has manipulated
 	 * the buffer an unexpected way. If we printed something then
-	 * @len must only increase.
+	 * @len must only increase. Also it should never overflow the
+	 * buffer size.
 	 */
-	if (i && i >= len) {
+	if ((i && i >= len) || len > sizeof(s->buffer)) {
 		const char *msg = "printk_nmi_flush: internal error\n";
 
 		printk_nmi_flush_line(msg, strlen(msg));
+		len = 0;
 	}
 
 	if (!len)
@@ -167,22 +205,7 @@ more:
 
 	/* Make sure that data has been written up to the @len */
 	smp_rmb();
-
-	size = min(len, sizeof(s->buffer));
-	last_i = i;
-
-	/* Print line by line. */
-	for (; i < size; i++) {
-		if (s->buffer[i] == '\n') {
-			printk_nmi_flush_seq_line(s, last_i, i);
-			last_i = i + 1;
-		}
-	}
-	/* Check if there was a partial line. */
-	if (last_i < size) {
-		printk_nmi_flush_seq_line(s, last_i, size - 1);
-		printk_nmi_flush_line("\n", strlen("\n"));
-	}
+	i += printk_nmi_flush_buffer(s->buffer + i, len - i);
 
 	/*
 	 * Check that nothing has got added in the meantime and truncate

+ 0 - 10
kernel/sys.c

@@ -1697,16 +1697,6 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
 		fput(exe_file);
 	}
 
-	/*
-	 * The symlink can be changed only once, just to disallow arbitrary
-	 * transitions malicious software might bring in. This means one
-	 * could make a snapshot over all processes running and monitor
-	 * /proc/pid/exe changes to notice unusual activity if needed.
-	 */
-	err = -EPERM;
-	if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
-		goto exit;
-
 	err = 0;
 	/* set the new file, lockless */
 	get_file(exe.file);

+ 20 - 1
lib/Kconfig.debug

@@ -15,6 +15,21 @@ config PRINTK_TIME
 	  The behavior is also controlled by the kernel command line
 	  parameter printk.time=1. See Documentation/kernel-parameters.txt
 
+config CONSOLE_LOGLEVEL_DEFAULT
+	int "Default console loglevel (1-15)"
+	range 1 15
+	default "7"
+	help
+	  Default loglevel to determine what will be printed on the console.
+
+	  Setting a default here is equivalent to passing in loglevel=<x> in
+	  the kernel bootargs. loglevel=<x> continues to override whatever
+	  value is specified here as well.
+
+	  Note: This does not affect the log level of un-prefixed printk()
+	  usage in the kernel. That is controlled by the MESSAGE_LOGLEVEL_DEFAULT
+	  option.
+
 config MESSAGE_LOGLEVEL_DEFAULT
 	int "Default message log level (1-7)"
 	range 1 7
@@ -26,6 +41,10 @@ config MESSAGE_LOGLEVEL_DEFAULT
 	  that are auditing their logs closely may want to set it to a lower
 	  priority.
 
+	  Note: This does not affect what message level gets printed on the console
+	  by default. To change that, use loglevel=<x> in the kernel bootargs,
+	  or pick a different CONSOLE_LOGLEVEL_DEFAULT configuration value.
+
 config BOOT_PRINTK_DELAY
 	bool "Delay each boot printk message by N milliseconds"
 	depends on DEBUG_KERNEL && PRINTK && GENERIC_CALIBRATE_DELAY
@@ -1986,7 +2005,7 @@ config ARCH_HAS_DEVMEM_IS_ALLOWED
 
 config STRICT_DEVMEM
 	bool "Filter access to /dev/mem"
-	depends on MMU
+	depends on MMU && DEVMEM
 	depends on ARCH_HAS_DEVMEM_IS_ALLOWED
 	default y if TILE || PPC
 	---help---

+ 11 - 0
lib/idr.c

@@ -927,6 +927,9 @@ EXPORT_SYMBOL(ida_pre_get);
  * and go back to the ida_pre_get() call.  If the ida is full, it will
  * return %-ENOSPC.
  *
+ * Note that callers must ensure that concurrent access to @ida is not possible.
+ * See ida_simple_get() for a variant which takes care of locking.
+ *
  * @p_id returns a value in the range @starting_id ... %0x7fffffff.
  */
 int ida_get_new_above(struct ida *ida, int starting_id, int *p_id)
@@ -1073,6 +1076,9 @@ EXPORT_SYMBOL(ida_destroy);
  * Allocates an id in the range start <= id < end, or returns -ENOSPC.
  * On memory allocation failure, returns -ENOMEM.
  *
+ * Compared to ida_get_new_above() this function does its own locking, and
+ * should be used unless there are special requirements.
+ *
  * Use ida_simple_remove() to get rid of an id.
  */
 int ida_simple_get(struct ida *ida, unsigned int start, unsigned int end,
@@ -1119,6 +1125,11 @@ EXPORT_SYMBOL(ida_simple_get);
  * ida_simple_remove - remove an allocated id.
  * @ida: the (initialized) ida.
  * @id: the id returned by ida_simple_get.
+ *
+ * Use to release an id allocated with ida_simple_get().
+ *
+ * Compared to ida_remove() this function does its own locking, and should be
+ * used unless there are special requirements.
  */
 void ida_simple_remove(struct ida *ida, unsigned int id)
 {

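The documentation added above draws the line between ida_get_new_above()/ida_remove(), where the caller must serialize access itself, and the ida_simple_*() helpers, which lock internally. A minimal usage sketch of the simple API (the names below are hypothetical):

#include <linux/idr.h>

static DEFINE_IDA(example_ida);

static int example_alloc_id(void)
{
	/* No external locking needed: ida_simple_get() locks internally. */
	return ida_simple_get(&example_ida, 0, 256, GFP_KERNEL);
}

static void example_free_id(int id)
{
	ida_simple_remove(&example_ida, id);
}
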
+ 190 - 107
lib/radix-tree.c

@@ -220,10 +220,10 @@ static void dump_node(struct radix_tree_node *node, unsigned long index)
 {
 	unsigned long i;
 
-	pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d parent %p\n",
+	pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d exceptional %d parent %p\n",
 		node, node->offset,
 		node->tags[0][0], node->tags[1][0], node->tags[2][0],
-		node->shift, node->count, node->parent);
+		node->shift, node->count, node->exceptional, node->parent);
 
 	for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
 		unsigned long first = index | (i << node->shift);
@@ -325,7 +325,6 @@ static void radix_tree_node_rcu_free(struct rcu_head *head)
 		tag_clear(node, i, 0);
 
 	node->slots[0] = NULL;
-	node->count = 0;
 
 	kmem_cache_free(radix_tree_node_cachep, node);
 }
@@ -522,8 +521,13 @@ static int radix_tree_extend(struct radix_tree_root *root,
 		node->offset = 0;
 		node->count = 1;
 		node->parent = NULL;
-		if (radix_tree_is_internal_node(slot))
+		if (radix_tree_is_internal_node(slot)) {
 			entry_to_node(slot)->parent = node;
+		} else {
+			/* Moving an exceptional root->rnode to a node */
+			if (radix_tree_exceptional_entry(slot))
+				node->exceptional = 1;
+		}
 		node->slots[0] = slot;
 		slot = node_to_entry(node);
 		rcu_assign_pointer(root->rnode, slot);
@@ -533,6 +537,104 @@ out:
 	return maxshift + RADIX_TREE_MAP_SHIFT;
 }
 
+/**
+ *	radix_tree_shrink    -    shrink radix tree to minimum height
+ *	@root		radix tree root
+ */
+static inline void radix_tree_shrink(struct radix_tree_root *root,
+				     radix_tree_update_node_t update_node,
+				     void *private)
+{
+	for (;;) {
+		struct radix_tree_node *node = root->rnode;
+		struct radix_tree_node *child;
+
+		if (!radix_tree_is_internal_node(node))
+			break;
+		node = entry_to_node(node);
+
+		/*
+		 * The candidate node has more than one child, or its child
+		 * is not at the leftmost slot, or the child is a multiorder
+		 * entry, we cannot shrink.
+		 */
+		if (node->count != 1)
+			break;
+		child = node->slots[0];
+		if (!child)
+			break;
+		if (!radix_tree_is_internal_node(child) && node->shift)
+			break;
+
+		if (radix_tree_is_internal_node(child))
+			entry_to_node(child)->parent = NULL;
+
+		/*
+		 * We don't need rcu_assign_pointer(), since we are simply
+		 * moving the node from one part of the tree to another: if it
+		 * was safe to dereference the old pointer to it
+		 * (node->slots[0]), it will be safe to dereference the new
+		 * one (root->rnode) as far as dependent read barriers go.
+		 */
+		root->rnode = child;
+
+		/*
+		 * We have a dilemma here. The node's slot[0] must not be
+		 * NULLed in case there are concurrent lookups expecting to
+		 * find the item. However if this was a bottom-level node,
+		 * then it may be subject to the slot pointer being visible
+		 * to callers dereferencing it. If item corresponding to
+		 * slot[0] is subsequently deleted, these callers would expect
+		 * their slot to become empty sooner or later.
+		 *
+		 * For example, lockless pagecache will look up a slot, deref
+		 * the page pointer, and if the page has 0 refcount it means it
+		 * was concurrently deleted from pagecache so try the deref
+		 * again. Fortunately there is already a requirement for logic
+		 * to retry the entire slot lookup -- the indirect pointer
+		 * problem (replacing direct root node with an indirect pointer
+		 * also results in a stale slot). So tag the slot as indirect
+		 * to force callers to retry.
+		 */
+		node->count = 0;
+		if (!radix_tree_is_internal_node(child)) {
+			node->slots[0] = RADIX_TREE_RETRY;
+			if (update_node)
+				update_node(node, private);
+		}
+
+		radix_tree_node_free(node);
+	}
+}
+
+static void delete_node(struct radix_tree_root *root,
+			struct radix_tree_node *node,
+			radix_tree_update_node_t update_node, void *private)
+{
+	do {
+		struct radix_tree_node *parent;
+
+		if (node->count) {
+			if (node == entry_to_node(root->rnode))
+				radix_tree_shrink(root, update_node, private);
+			return;
+		}
+
+		parent = node->parent;
+		if (parent) {
+			parent->slots[node->offset] = NULL;
+			parent->count--;
+		} else {
+			root_tag_clear_all(root);
+			root->rnode = NULL;
+		}
+
+		radix_tree_node_free(node);
+
+		node = parent;
+	} while (node);
+}
+
 /**
  *	__radix_tree_create	-	create a slot in a radix tree
  *	@root:		radix tree root
@@ -649,6 +751,8 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index,
 	if (node) {
 		unsigned offset = get_slot_offset(node, slot);
 		node->count++;
+		if (radix_tree_exceptional_entry(item))
+			node->exceptional++;
 		BUG_ON(tag_get(node, 0, offset));
 		BUG_ON(tag_get(node, 1, offset));
 		BUG_ON(tag_get(node, 2, offset));
@@ -746,6 +850,85 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
 }
 EXPORT_SYMBOL(radix_tree_lookup);
 
+static void replace_slot(struct radix_tree_root *root,
+			 struct radix_tree_node *node,
+			 void **slot, void *item,
+			 bool warn_typeswitch)
+{
+	void *old = rcu_dereference_raw(*slot);
+	int count, exceptional;
+
+	WARN_ON_ONCE(radix_tree_is_internal_node(item));
+
+	count = !!item - !!old;
+	exceptional = !!radix_tree_exceptional_entry(item) -
+		      !!radix_tree_exceptional_entry(old);
+
+	WARN_ON_ONCE(warn_typeswitch && (count || exceptional));
+
+	if (node) {
+		node->count += count;
+		node->exceptional += exceptional;
+	}
+
+	rcu_assign_pointer(*slot, item);
+}
+
+/**
+ * __radix_tree_replace		- replace item in a slot
+ * @root:		radix tree root
+ * @node:		pointer to tree node
+ * @slot:		pointer to slot in @node
+ * @item:		new item to store in the slot.
+ * @update_node:	callback for changing leaf nodes
+ * @private:		private data to pass to @update_node
+ *
+ * For use with __radix_tree_lookup().  Caller must hold tree write locked
+ * across slot lookup and replacement.
+ */
+void __radix_tree_replace(struct radix_tree_root *root,
+			  struct radix_tree_node *node,
+			  void **slot, void *item,
+			  radix_tree_update_node_t update_node, void *private)
+{
+	/*
+	 * This function supports replacing exceptional entries and
+	 * deleting entries, but that needs accounting against the
+	 * node unless the slot is root->rnode.
+	 */
+	replace_slot(root, node, slot, item,
+		     !node && slot != (void **)&root->rnode);
+
+	if (!node)
+		return;
+
+	if (update_node)
+		update_node(node, private);
+
+	delete_node(root, node, update_node, private);
+}
+
+/**
+ * radix_tree_replace_slot	- replace item in a slot
+ * @root:	radix tree root
+ * @slot:	pointer to slot
+ * @item:	new item to store in the slot.
+ *
+ * For use with radix_tree_lookup_slot(), radix_tree_gang_lookup_slot(),
+ * radix_tree_gang_lookup_tag_slot().  Caller must hold tree write locked
+ * across slot lookup and replacement.
+ *
+ * NOTE: This cannot be used to switch between non-entries (empty slots),
+ * regular entries, and exceptional entries, as that requires accounting
+ * inside the radix tree node. When switching from one type of entry or
+ * deleting, use __radix_tree_lookup() and __radix_tree_replace().
+ */
+void radix_tree_replace_slot(struct radix_tree_root *root,
+			     void **slot, void *item)
+{
+	replace_slot(root, NULL, slot, item, true);
+}
+
 /**
  *	radix_tree_tag_set - set a tag on a radix tree node
  *	@root:		radix tree root
@@ -1393,75 +1576,6 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
 }
 #endif /* CONFIG_SHMEM && CONFIG_SWAP */
 
-/**
- *	radix_tree_shrink    -    shrink radix tree to minimum height
- *	@root		radix tree root
- */
-static inline bool radix_tree_shrink(struct radix_tree_root *root)
-{
-	bool shrunk = false;
-
-	for (;;) {
-		struct radix_tree_node *node = root->rnode;
-		struct radix_tree_node *child;
-
-		if (!radix_tree_is_internal_node(node))
-			break;
-		node = entry_to_node(node);
-
-		/*
-		 * The candidate node has more than one child, or its child
-		 * is not at the leftmost slot, or the child is a multiorder
-		 * entry, we cannot shrink.
-		 */
-		if (node->count != 1)
-			break;
-		child = node->slots[0];
-		if (!child)
-			break;
-		if (!radix_tree_is_internal_node(child) && node->shift)
-			break;
-
-		if (radix_tree_is_internal_node(child))
-			entry_to_node(child)->parent = NULL;
-
-		/*
-		 * We don't need rcu_assign_pointer(), since we are simply
-		 * moving the node from one part of the tree to another: if it
-		 * was safe to dereference the old pointer to it
-		 * (node->slots[0]), it will be safe to dereference the new
-		 * one (root->rnode) as far as dependent read barriers go.
-		 */
-		root->rnode = child;
-
-		/*
-		 * We have a dilemma here. The node's slot[0] must not be
-		 * NULLed in case there are concurrent lookups expecting to
-		 * find the item. However if this was a bottom-level node,
-		 * then it may be subject to the slot pointer being visible
-		 * to callers dereferencing it. If item corresponding to
-		 * slot[0] is subsequently deleted, these callers would expect
-		 * their slot to become empty sooner or later.
-		 *
-		 * For example, lockless pagecache will look up a slot, deref
-		 * the page pointer, and if the page has 0 refcount it means it
-		 * was concurrently deleted from pagecache so try the deref
-		 * again. Fortunately there is already a requirement for logic
-		 * to retry the entire slot lookup -- the indirect pointer
-		 * problem (replacing direct root node with an indirect pointer
-		 * also results in a stale slot). So tag the slot as indirect
-		 * to force callers to retry.
-		 */
-		if (!radix_tree_is_internal_node(child))
-			node->slots[0] = RADIX_TREE_RETRY;
-
-		radix_tree_node_free(node);
-		shrunk = true;
-	}
-
-	return shrunk;
-}
-
 /**
  *	__radix_tree_delete_node    -    try to free node after clearing a slot
  *	@root:		radix tree root
@@ -1470,39 +1584,11 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root)
  *	After clearing the slot at @index in @node from radix tree
  *	rooted at @root, call this function to attempt freeing the
  *	node and shrinking the tree.
- *
- *	Returns %true if @node was freed, %false otherwise.
  */
-bool __radix_tree_delete_node(struct radix_tree_root *root,
+void __radix_tree_delete_node(struct radix_tree_root *root,
 			      struct radix_tree_node *node)
 {
-	bool deleted = false;
-
-	do {
-		struct radix_tree_node *parent;
-
-		if (node->count) {
-			if (node == entry_to_node(root->rnode))
-				deleted |= radix_tree_shrink(root);
-			return deleted;
-		}
-
-		parent = node->parent;
-		if (parent) {
-			parent->slots[node->offset] = NULL;
-			parent->count--;
-		} else {
-			root_tag_clear_all(root);
-			root->rnode = NULL;
-		}
-
-		radix_tree_node_free(node);
-		deleted = true;
-
-		node = parent;
-	} while (node);
-
-	return deleted;
+	delete_node(root, node, NULL, NULL);
 }
 
 static inline void delete_sibling_entries(struct radix_tree_node *node,
@@ -1559,10 +1645,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
 		node_tag_clear(root, node, tag, offset);
 
 	delete_sibling_entries(node, node_to_entry(slot), offset);
-	node->slots[offset] = NULL;
-	node->count--;
-
-	__radix_tree_delete_node(root, node);
+	__radix_tree_replace(root, node, slot, NULL, NULL, NULL);
 
 	return entry;
 }

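The replacement API above funnels every slot update through replace_slot() so that node->count and node->exceptional stay accurate. A hypothetical caller of the out-of-line radix_tree_replace_slot(), following the locking rule stated in its comment (tree write-locked across lookup and replacement):

static void example_replace(struct radix_tree_root *root,
			    unsigned long index, void *new_item)
{
	void **slot;

	/* Caller is assumed to hold the tree's write lock here. */
	slot = radix_tree_lookup_slot(root, index);
	if (slot)
		radix_tree_replace_slot(root, slot, new_item);
}
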
+ 19 - 4
lib/rbtree.c

@@ -296,11 +296,26 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
 				 *
 				 *   (p)           (p)
 				 *   / \           / \
-				 *  N   S    -->  N   Sl
+				 *  N   S    -->  N   sl
 				 *     / \             \
-				 *    sl  Sr            s
+				 *    sl  Sr            S
 				 *                       \
 				 *                        Sr
+				 *
+				 * Note: p might be red, and then both
+				 * p and sl are red after rotation(which
+				 * breaks property 4). This is fixed in
+				 * Case 4 (in __rb_rotate_set_parents()
+				 *         which set sl the color of p
+				 *         and set p RB_BLACK)
+				 *
+				 *   (p)            (sl)
+				 *   / \            /  \
+				 *  N   sl   -->   P    S
+				 *       \        /      \
+				 *        S      N        Sr
+				 *         \
+				 *          Sr
 				 */
 				tmp1 = tmp2->rb_right;
 				WRITE_ONCE(sibling->rb_left, tmp1);
@@ -365,7 +380,7 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
 					}
 					break;
 				}
-				/* Case 3 - right rotate at sibling */
+				/* Case 3 - left rotate at sibling */
 				tmp1 = tmp2->rb_left;
 				WRITE_ONCE(sibling->rb_right, tmp1);
 				WRITE_ONCE(tmp2->rb_left, sibling);
@@ -377,7 +392,7 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
 				tmp1 = sibling;
 				sibling = tmp2;
 			}
-			/* Case 4 - left rotate at parent + color flips */
+			/* Case 4 - right rotate at parent + color flips */
 			tmp2 = sibling->rb_right;
 			WRITE_ONCE(parent->rb_left, tmp2);
 			WRITE_ONCE(sibling->rb_right, parent);

+ 2 - 6
mm/Kconfig

@@ -153,7 +153,7 @@ config MOVABLE_NODE
 	bool "Enable to assign a node which has only movable memory"
 	depends on HAVE_MEMBLOCK
 	depends on NO_BOOTMEM
-	depends on X86_64
+	depends on X86_64 || OF_EARLY_FLATTREE || MEMORY_HOTPLUG
 	depends on NUMA
 	default n
 	help
@@ -447,13 +447,9 @@ choice
 	  benefit.
 endchoice
 
-#
-# We don't deposit page tables on file THP mapping,
-# but Power makes use of them to address MMU quirk.
-#
 config	TRANSPARENT_HUGE_PAGECACHE
 	def_bool y
-	depends on TRANSPARENT_HUGEPAGE && !PPC
+	depends on TRANSPARENT_HUGEPAGE
 
 #
 # UP and nommu archs use km based percpu allocator

+ 3 - 22
mm/compaction.c

@@ -634,22 +634,6 @@ isolate_freepages_range(struct compact_control *cc,
 	return pfn;
 }
 
-/* Update the number of anon and file isolated pages in the zone */
-static void acct_isolated(struct zone *zone, struct compact_control *cc)
-{
-	struct page *page;
-	unsigned int count[2] = { 0, };
-
-	if (list_empty(&cc->migratepages))
-		return;
-
-	list_for_each_entry(page, &cc->migratepages, lru)
-		count[!!page_is_file_cache(page)]++;
-
-	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, count[0]);
-	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, count[1]);
-}
-
 /* Similar to reclaim, but different enough that they don't share logic */
 static bool too_many_isolated(struct zone *zone)
 {
@@ -866,6 +850,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 
 		/* Successfully isolated */
 		del_page_from_lru_list(page, lruvec, page_lru(page));
+		inc_node_page_state(page,
+				NR_ISOLATED_ANON + page_is_file_cache(page));
 
 isolate_success:
 		list_add(&page->lru, &cc->migratepages);
@@ -902,7 +888,6 @@ isolate_fail:
 				spin_unlock_irqrestore(zone_lru_lock(zone), flags);
 				locked = false;
 			}
-			acct_isolated(zone, cc);
 			putback_movable_pages(&cc->migratepages);
 			cc->nr_migratepages = 0;
 			cc->last_migrated_pfn = 0;
@@ -988,7 +973,6 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
 		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
 			break;
 	}
-	acct_isolated(cc->zone, cc);
 
 	return pfn;
 }
@@ -1258,10 +1242,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 		low_pfn = isolate_migratepages_block(cc, low_pfn,
 						block_end_pfn, isolate_mode);
 
-		if (!low_pfn || cc->contended) {
-			acct_isolated(zone, cc);
+		if (!low_pfn || cc->contended)
 			return ISOLATE_ABORT;
-		}
 
 		/*
 		 * Either we isolated something and proceed with migration. Or
@@ -1271,7 +1253,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 		break;
 		break;
 	}
 	}
 
 
-	acct_isolated(zone, cc);
 	/* Record where migration scanner will be restarted. */
 	/* Record where migration scanner will be restarted. */
 	cc->migrate_pfn = low_pfn;
 	cc->migrate_pfn = low_pfn;
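The hunks above drop the batched acct_isolated() helper and instead bump NR_ISOLATED_ANON/NR_ISOLATED_FILE the moment a page is isolated (the mm/migrate.c changes below do the matching decrements). Those per-node counters are visible in /proc/vmstat; a minimal sketch of a userspace watcher, assuming the usual field names of this era, is:

/* Minimal sketch: print the isolation counters that this patch now
 * updates per page instead of in a batch. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f) {
		perror("fopen /proc/vmstat");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "nr_isolated_anon ", 17) ||
		    !strncmp(line, "nr_isolated_file ", 17))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}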
 
 

+ 4 - 0
mm/debug.c

@@ -59,6 +59,10 @@ void __dump_page(struct page *page, const char *reason)
 
 
 	pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags);
 	pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags);
 
 
+	print_hex_dump(KERN_ALERT, "raw: ", DUMP_PREFIX_NONE, 32,
+			sizeof(unsigned long), page,
+			sizeof(struct page), false);
+
 	if (reason)
 	if (reason)
 		pr_alert("page dumped because: %s\n", reason);
 		pr_alert("page dumped because: %s\n", reason);
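The added print_hex_dump() dumps the raw words of struct page: 32 bytes per row, grouped into unsigned-long-sized chunks, no ASCII column. A userspace approximation of that layout, with a made-up placeholder struct standing in for struct page, looks like this:

/* Userspace approximation of the "raw:" dump added above: 32 bytes per
 * row, grouped into unsigned-long-sized words, no ASCII column.  The
 * struct below is a stand-in, not the kernel's struct page layout. */
#include <stdio.h>

struct fake_page {		/* placeholder only */
	unsigned long flags;
	void *mapping;
	unsigned long index;
	unsigned long counters;
	void *lru_next, *lru_prev;
	void *private_data;
	unsigned long extra;
};

static void dump_raw(const void *obj, size_t len)
{
	const unsigned long *w = obj;
	size_t words_per_row = 32 / sizeof(unsigned long);
	size_t i, n = len / sizeof(unsigned long);

	for (i = 0; i < n; i++) {
		if (i % words_per_row == 0)
			printf("raw:");
		printf(" %0*lx", (int)(2 * sizeof(unsigned long)), w[i]);
		if (i % words_per_row == words_per_row - 1 || i == n - 1)
			putchar('\n');
	}
}

int main(void)
{
	struct fake_page p = { .flags = 0x2fffUL, .index = 42 };

	dump_raw(&p, sizeof(p));
	return 0;
}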
 
 

+ 10 - 58
mm/filemap.c

@@ -132,44 +132,29 @@ static int page_cache_tree_insert(struct address_space *mapping,
 		if (!dax_mapping(mapping)) {
 		if (!dax_mapping(mapping)) {
 			if (shadowp)
 			if (shadowp)
 				*shadowp = p;
 				*shadowp = p;
-			if (node)
-				workingset_node_shadows_dec(node);
 		} else {
 		} else {
 			/* DAX can replace empty locked entry with a hole */
 			/* DAX can replace empty locked entry with a hole */
 			WARN_ON_ONCE(p !=
 			WARN_ON_ONCE(p !=
 				(void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
 				(void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
 					 RADIX_DAX_ENTRY_LOCK));
 					 RADIX_DAX_ENTRY_LOCK));
-			/* DAX accounts exceptional entries as normal pages */
-			if (node)
-				workingset_node_pages_dec(node);
 			/* Wakeup waiters for exceptional entry lock */
 			/* Wakeup waiters for exceptional entry lock */
 			dax_wake_mapping_entry_waiter(mapping, page->index,
 			dax_wake_mapping_entry_waiter(mapping, page->index,
 						      false);
 						      false);
 		}
 		}
 	}
 	}
-	radix_tree_replace_slot(slot, page);
+	__radix_tree_replace(&mapping->page_tree, node, slot, page,
+			     workingset_update_node, mapping);
 	mapping->nrpages++;
 	mapping->nrpages++;
-	if (node) {
-		workingset_node_pages_inc(node);
-		/*
-		 * Don't track node that contains actual pages.
-		 *
-		 * Avoid acquiring the list_lru lock if already
-		 * untracked.  The list_empty() test is safe as
-		 * node->private_list is protected by
-		 * mapping->tree_lock.
-		 */
-		if (!list_empty(&node->private_list))
-			list_lru_del(&workingset_shadow_nodes,
-				     &node->private_list);
-	}
 	return 0;
 	return 0;
 }
 }
 
 
 static void page_cache_tree_delete(struct address_space *mapping,
 static void page_cache_tree_delete(struct address_space *mapping,
 				   struct page *page, void *shadow)
 				   struct page *page, void *shadow)
 {
 {
-	int i, nr = PageHuge(page) ? 1 : hpage_nr_pages(page);
+	int i, nr;
+
+	/* hugetlb pages are represented by one entry in the radix tree */
+	nr = PageHuge(page) ? 1 : hpage_nr_pages(page);
 
 
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	VM_BUG_ON_PAGE(PageTail(page), page);
 	VM_BUG_ON_PAGE(PageTail(page), page);
@@ -182,44 +167,11 @@ static void page_cache_tree_delete(struct address_space *mapping,
 		__radix_tree_lookup(&mapping->page_tree, page->index + i,
 		__radix_tree_lookup(&mapping->page_tree, page->index + i,
 				    &node, &slot);
 				    &node, &slot);
 
 
-		radix_tree_clear_tags(&mapping->page_tree, node, slot);
-
-		if (!node) {
-			VM_BUG_ON_PAGE(nr != 1, page);
-			/*
-			 * We need a node to properly account shadow
-			 * entries. Don't plant any without. XXX
-			 */
-			shadow = NULL;
-		}
-
-		radix_tree_replace_slot(slot, shadow);
+		VM_BUG_ON_PAGE(!node && nr != 1, page);
 
 
-		if (!node)
-			break;
-
-		workingset_node_pages_dec(node);
-		if (shadow)
-			workingset_node_shadows_inc(node);
-		else
-			if (__radix_tree_delete_node(&mapping->page_tree, node))
-				continue;
-
-		/*
-		 * Track node that only contains shadow entries. DAX mappings
-		 * contain no shadow entries and may contain other exceptional
-		 * entries so skip those.
-		 *
-		 * Avoid acquiring the list_lru lock if already tracked.
-		 * The list_empty() test is safe as node->private_list is
-		 * protected by mapping->tree_lock.
-		 */
-		if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
-				list_empty(&node->private_list)) {
-			node->private_data = mapping;
-			list_lru_add(&workingset_shadow_nodes,
-					&node->private_list);
-		}
+		radix_tree_clear_tags(&mapping->page_tree, node, slot);
+		__radix_tree_replace(&mapping->page_tree, node, slot, shadow,
+				     workingset_update_node, mapping);
 	}
 	}
 
 
 	if (shadow) {
 	if (shadow) {
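Both page_cache_tree_insert() and page_cache_tree_delete() now hand the slot update to __radix_tree_replace() together with the workingset_update_node callback, so the node's page/shadow bookkeeping lives in one place instead of being open-coded at every call site. A much-simplified model of that "replace a slot, let one callback do the bookkeeping" shape (plain C with invented names, not the kernel's radix-tree API):

/* Simplified model of slot replacement with a single update hook. */
#include <stdio.h>

#define SLOTS 16

struct node {
	void *slots[SLOTS];
	unsigned int pages;	/* real entries   */
	unsigned int shadows;	/* shadow entries */
};

typedef void (*update_fn)(struct node *node, void *private);

static int is_shadow(void *entry)
{
	return entry && ((unsigned long)entry & 1);	/* tagged pointer */
}

static void replace_slot(struct node *node, unsigned int idx, void *item,
			 update_fn update, void *private)
{
	void *old = node->slots[idx];

	/* account what goes away and what comes in */
	if (old) {
		if (is_shadow(old))
			node->shadows--;
		else
			node->pages--;
	}
	if (item) {
		if (is_shadow(item))
			node->shadows++;
		else
			node->pages++;
	}
	node->slots[idx] = item;

	/* the single hook where shadow-LRU tracking would live */
	if (update)
		update(node, private);
}

static void workingset_update(struct node *node, void *private)
{
	printf("node: %u pages, %u shadow entries (mapping=%p)\n",
	       node->pages, node->shadows, private);
}

int main(void)
{
	struct node n = { .pages = 0 };
	static unsigned long page;		/* stand-in for a page */
	void *shadow = (void *)((unsigned long)&page | 1);

	replace_slot(&n, 0, &page, workingset_update, NULL);	/* insert page   */
	replace_slot(&n, 0, shadow, workingset_update, NULL);	/* page -> shadow */
	return 0;
}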

+ 8 - 11
mm/gup.c

@@ -632,7 +632,8 @@ next_page:
 	return i;
 	return i;
 }
 }
 
 
-bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags)
+static bool vma_permits_fault(struct vm_area_struct *vma,
+			      unsigned int fault_flags)
 {
 {
 	bool write   = !!(fault_flags & FAULT_FLAG_WRITE);
 	bool write   = !!(fault_flags & FAULT_FLAG_WRITE);
 	bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
 	bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
@@ -857,14 +858,12 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
 EXPORT_SYMBOL(get_user_pages_locked);
 EXPORT_SYMBOL(get_user_pages_locked);
 
 
 /*
 /*
- * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows to
- * pass additional gup_flags as last parameter (like FOLL_HWPOISON).
+ * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows for
+ * tsk, mm to be specified.
  *
  *
  * NOTE: here FOLL_TOUCH is not set implicitly and must be set by the
  * NOTE: here FOLL_TOUCH is not set implicitly and must be set by the
- * caller if required (just like with __get_user_pages). "FOLL_GET",
- * "FOLL_WRITE" and "FOLL_FORCE" are set implicitly as needed
- * according to the parameters "pages", "write", "force"
- * respectively.
+ * caller if required (just like with __get_user_pages). "FOLL_GET"
+ * is set implicitly if "pages" is non-NULL.
  */
  */
 __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
 __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
 					       unsigned long start, unsigned long nr_pages,
 					       unsigned long start, unsigned long nr_pages,
@@ -894,10 +893,8 @@ EXPORT_SYMBOL(__get_user_pages_unlocked);
  *      get_user_pages_unlocked(tsk, mm, ..., pages);
  *      get_user_pages_unlocked(tsk, mm, ..., pages);
  *
  *
  * It is functionally equivalent to get_user_pages_fast so
  * It is functionally equivalent to get_user_pages_fast so
- * get_user_pages_fast should be used instead, if the two parameters
- * "tsk" and "mm" are respectively equal to current and current->mm,
- * or if "force" shall be set to 1 (get_user_pages_fast misses the
- * "force" parameter).
+ * get_user_pages_fast should be used instead if specific gup_flags
+ * (e.g. FOLL_FORCE) are not required.
  */
  */
 long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
 long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
 			     struct page **pages, unsigned int gup_flags)
 			     struct page **pages, unsigned int gup_flags)

+ 49 - 4
mm/huge_memory.c

@@ -285,6 +285,15 @@ static ssize_t use_zero_page_store(struct kobject *kobj,
 }
 }
 static struct kobj_attribute use_zero_page_attr =
 static struct kobj_attribute use_zero_page_attr =
 	__ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
 	__ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
+
+static ssize_t hpage_pmd_size_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%lu\n", HPAGE_PMD_SIZE);
+}
+static struct kobj_attribute hpage_pmd_size_attr =
+	__ATTR_RO(hpage_pmd_size);
+
 #ifdef CONFIG_DEBUG_VM
 #ifdef CONFIG_DEBUG_VM
 static ssize_t debug_cow_show(struct kobject *kobj,
 static ssize_t debug_cow_show(struct kobject *kobj,
 				struct kobj_attribute *attr, char *buf)
 				struct kobj_attribute *attr, char *buf)
@@ -307,6 +316,7 @@ static struct attribute *hugepage_attr[] = {
 	&enabled_attr.attr,
 	&enabled_attr.attr,
 	&defrag_attr.attr,
 	&defrag_attr.attr,
 	&use_zero_page_attr.attr,
 	&use_zero_page_attr.attr,
+	&hpage_pmd_size_attr.attr,
 #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
 #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
 	&shmem_enabled_attr.attr,
 	&shmem_enabled_attr.attr,
 #endif
 #endif
@@ -1323,6 +1333,8 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	struct mm_struct *mm = tlb->mm;
 	struct mm_struct *mm = tlb->mm;
 	bool ret = false;
 	bool ret = false;
 
 
+	tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE);
+
 	ptl = pmd_trans_huge_lock(pmd, vma);
 	ptl = pmd_trans_huge_lock(pmd, vma);
 	if (!ptl)
 	if (!ptl)
 		goto out_unlocked;
 		goto out_unlocked;
@@ -1378,12 +1390,23 @@ out_unlocked:
 	return ret;
 	return ret;
 }
 }
 
 
+static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
+{
+	pgtable_t pgtable;
+
+	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+	pte_free(mm, pgtable);
+	atomic_long_dec(&mm->nr_ptes);
+}
+
 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		 pmd_t *pmd, unsigned long addr)
 		 pmd_t *pmd, unsigned long addr)
 {
 {
 	pmd_t orig_pmd;
 	pmd_t orig_pmd;
 	spinlock_t *ptl;
 	spinlock_t *ptl;
 
 
+	tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE);
+
 	ptl = __pmd_trans_huge_lock(pmd, vma);
 	ptl = __pmd_trans_huge_lock(pmd, vma);
 	if (!ptl)
 	if (!ptl)
 		return 0;
 		return 0;
@@ -1399,12 +1422,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	if (vma_is_dax(vma)) {
 	if (vma_is_dax(vma)) {
 		spin_unlock(ptl);
 		spin_unlock(ptl);
 		if (is_huge_zero_pmd(orig_pmd))
 		if (is_huge_zero_pmd(orig_pmd))
-			tlb_remove_page(tlb, pmd_page(orig_pmd));
+			tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
 	} else if (is_huge_zero_pmd(orig_pmd)) {
 	} else if (is_huge_zero_pmd(orig_pmd)) {
 		pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
 		pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
 		atomic_long_dec(&tlb->mm->nr_ptes);
 		atomic_long_dec(&tlb->mm->nr_ptes);
 		spin_unlock(ptl);
 		spin_unlock(ptl);
-		tlb_remove_page(tlb, pmd_page(orig_pmd));
+		tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
 	} else {
 	} else {
 		struct page *page = pmd_page(orig_pmd);
 		struct page *page = pmd_page(orig_pmd);
 		page_remove_rmap(page, true);
 		page_remove_rmap(page, true);
@@ -1417,6 +1440,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 			atomic_long_dec(&tlb->mm->nr_ptes);
 			atomic_long_dec(&tlb->mm->nr_ptes);
 			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
 			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
 		} else {
 		} else {
+			if (arch_needs_pgtable_deposit())
+				zap_deposited_table(tlb->mm, pmd);
 			add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR);
 			add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR);
 		}
 		}
 		spin_unlock(ptl);
 		spin_unlock(ptl);
@@ -1425,6 +1450,21 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	return 1;
 	return 1;
 }
 }
 
 
+#ifndef pmd_move_must_withdraw
+static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
+					 spinlock_t *old_pmd_ptl,
+					 struct vm_area_struct *vma)
+{
+	/*
+	 * With split pmd lock we also need to move preallocated
+	 * PTE page table if new_pmd is on different PMD page table.
+	 *
+	 * We also don't deposit and withdraw tables for file pages.
+	 */
+	return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
+}
+#endif
+
 bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 		  unsigned long new_addr, unsigned long old_end,
 		  unsigned long new_addr, unsigned long old_end,
 		  pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush)
 		  pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush)
@@ -1462,8 +1502,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 			force_flush = true;
 			force_flush = true;
 		VM_BUG_ON(!pmd_none(*new_pmd));
 		VM_BUG_ON(!pmd_none(*new_pmd));
 
 
-		if (pmd_move_must_withdraw(new_ptl, old_ptl) &&
-				vma_is_anonymous(vma)) {
+		if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
 			pgtable_t pgtable;
 			pgtable_t pgtable;
 			pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
 			pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
 			pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
 			pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
@@ -1589,6 +1628,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 
 
 	if (!vma_is_anonymous(vma)) {
 	if (!vma_is_anonymous(vma)) {
 		_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
 		_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+		/*
+		 * We are going to unmap this huge page. So
+		 * just go ahead and zap it
+		 */
+		if (arch_needs_pgtable_deposit())
+			zap_deposited_table(mm, pmd);
 		if (vma_is_dax(vma))
 		if (vma_is_dax(vma))
 			return;
 			return;
 		page = pmd_page(_pmd);
 		page = pmd_page(_pmd);
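The first hunk exports the PMD huge-page size as a read-only sysfs attribute, so userspace no longer has to derive it from the architecture. A minimal reader, assuming THP sysfs is available at the usual path:

/* Read the PMD huge page size exposed by the new hpage_pmd_size file. */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size";
	unsigned long size;
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	if (fscanf(f, "%lu", &size) == 1)
		printf("PMD-sized huge page: %lu bytes\n", size);
	fclose(f);
	return 0;
}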

+ 15 - 10
mm/hugetlb.c

@@ -3286,6 +3286,11 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	BUG_ON(start & ~huge_page_mask(h));
 	BUG_ON(start & ~huge_page_mask(h));
 	BUG_ON(end & ~huge_page_mask(h));
 	BUG_ON(end & ~huge_page_mask(h));
 
 
+	/*
+	 * This is a hugetlb vma; all the pte entries should point
+	 * to a huge page.
+	 */
+	tlb_remove_check_page_size_change(tlb, sz);
 	tlb_start_vma(tlb, vma);
 	tlb_start_vma(tlb, vma);
 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	address = start;
 	address = start;
@@ -3336,7 +3341,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		}
 		}
 
 
 		pte = huge_ptep_get_and_clear(mm, address, ptep);
 		pte = huge_ptep_get_and_clear(mm, address, ptep);
-		tlb_remove_tlb_entry(tlb, ptep, address);
+		tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
 		if (huge_pte_dirty(pte))
 		if (huge_pte_dirty(pte))
 			set_page_dirty(page);
 			set_page_dirty(page);
 
 
@@ -3450,15 +3455,17 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
  * Keep the pte_same checks anyway to make transition from the mutex easier.
  * Keep the pte_same checks anyway to make transition from the mutex easier.
  */
  */
 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
-			unsigned long address, pte_t *ptep, pte_t pte,
-			struct page *pagecache_page, spinlock_t *ptl)
+		       unsigned long address, pte_t *ptep,
+		       struct page *pagecache_page, spinlock_t *ptl)
 {
 {
+	pte_t pte;
 	struct hstate *h = hstate_vma(vma);
 	struct hstate *h = hstate_vma(vma);
 	struct page *old_page, *new_page;
 	struct page *old_page, *new_page;
 	int ret = 0, outside_reserve = 0;
 	int ret = 0, outside_reserve = 0;
 	unsigned long mmun_start;	/* For mmu_notifiers */
 	unsigned long mmun_start;	/* For mmu_notifiers */
 	unsigned long mmun_end;		/* For mmu_notifiers */
 	unsigned long mmun_end;		/* For mmu_notifiers */
 
 
+	pte = huge_ptep_get(ptep);
 	old_page = pte_page(pte);
 	old_page = pte_page(pte);
 
 
 retry_avoidcopy:
 retry_avoidcopy:
@@ -3711,8 +3718,7 @@ retry:
 		vma_end_reservation(h, vma, address);
 		vma_end_reservation(h, vma, address);
 	}
 	}
 
 
-	ptl = huge_pte_lockptr(h, mm, ptep);
-	spin_lock(ptl);
+	ptl = huge_pte_lock(h, mm, ptep);
 	size = i_size_read(mapping->host) >> huge_page_shift(h);
 	size = i_size_read(mapping->host) >> huge_page_shift(h);
 	if (idx >= size)
 	if (idx >= size)
 		goto backout;
 		goto backout;
@@ -3733,7 +3739,7 @@ retry:
 	hugetlb_count_add(pages_per_huge_page(h), mm);
 	hugetlb_count_add(pages_per_huge_page(h), mm);
 	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
 	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
 		/* Optimization, do the COW without a second fault */
 		/* Optimization, do the COW without a second fault */
-		ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl);
+		ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
 	}
 	}
 
 
 	spin_unlock(ptl);
 	spin_unlock(ptl);
@@ -3888,8 +3894,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 
 	if (flags & FAULT_FLAG_WRITE) {
 	if (flags & FAULT_FLAG_WRITE) {
 		if (!huge_pte_write(entry)) {
 		if (!huge_pte_write(entry)) {
-			ret = hugetlb_cow(mm, vma, address, ptep, entry,
-					pagecache_page, ptl);
+			ret = hugetlb_cow(mm, vma, address, ptep,
+					  pagecache_page, ptl);
 			goto out_put_page;
 			goto out_put_page;
 		}
 		}
 		entry = huge_pte_mkdirty(entry);
 		entry = huge_pte_mkdirty(entry);
@@ -4330,8 +4336,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 	if (!spte)
 	if (!spte)
 		goto out;
 		goto out;
 
 
-	ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte);
-	spin_lock(ptl);
+	ptl = huge_pte_lock(hstate_vma(vma), mm, spte);
 	if (pud_none(*pud)) {
 	if (pud_none(*pud)) {
 		pud_populate(mm, pud,
 		pud_populate(mm, pud,
 				(pmd_t *)((unsigned long)spte & PAGE_MASK));
 				(pmd_t *)((unsigned long)spte & PAGE_MASK));

+ 48 - 46
mm/kasan/quarantine.c

@@ -86,24 +86,9 @@ static void qlist_move_all(struct qlist_head *from, struct qlist_head *to)
 	qlist_init(from);
 	qlist_init(from);
 }
 }
 
 
-static void qlist_move(struct qlist_head *from, struct qlist_node *last,
-		struct qlist_head *to, size_t size)
-{
-	if (unlikely(last == from->tail)) {
-		qlist_move_all(from, to);
-		return;
-	}
-	if (qlist_empty(to))
-		to->head = from->head;
-	else
-		to->tail->next = from->head;
-	to->tail = last;
-	from->head = last->next;
-	last->next = NULL;
-	from->bytes -= size;
-	to->bytes += size;
-}
-
+#define QUARANTINE_PERCPU_SIZE (1 << 20)
+#define QUARANTINE_BATCHES \
+	(1024 > 4 * CONFIG_NR_CPUS ? 1024 : 4 * CONFIG_NR_CPUS)
 
 
 /*
 /*
  * The object quarantine consists of per-cpu queues and a global queue,
  * The object quarantine consists of per-cpu queues and a global queue,
@@ -111,11 +96,22 @@ static void qlist_move(struct qlist_head *from, struct qlist_node *last,
  */
  */
 static DEFINE_PER_CPU(struct qlist_head, cpu_quarantine);
 static DEFINE_PER_CPU(struct qlist_head, cpu_quarantine);
 
 
-static struct qlist_head global_quarantine;
+/* Round-robin FIFO array of batches. */
+static struct qlist_head global_quarantine[QUARANTINE_BATCHES];
+static int quarantine_head;
+static int quarantine_tail;
+/* Total size of all objects in global_quarantine across all batches. */
+static unsigned long quarantine_size;
 static DEFINE_SPINLOCK(quarantine_lock);
 static DEFINE_SPINLOCK(quarantine_lock);
 
 
 /* Maximum size of the global queue. */
 /* Maximum size of the global queue. */
-static unsigned long quarantine_size;
+static unsigned long quarantine_max_size;
+
+/*
+ * Target size of a batch in global_quarantine.
+ * Usually equal to QUARANTINE_PERCPU_SIZE unless we have too much RAM.
+ */
+static unsigned long quarantine_batch_size;
 
 
 /*
 /*
  * The fraction of physical memory the quarantine is allowed to occupy.
  * The fraction of physical memory the quarantine is allowed to occupy.
@@ -124,9 +120,6 @@ static unsigned long quarantine_size;
  */
  */
 #define QUARANTINE_FRACTION 32
 #define QUARANTINE_FRACTION 32
 
 
-#define QUARANTINE_LOW_SIZE (READ_ONCE(quarantine_size) * 3 / 4)
-#define QUARANTINE_PERCPU_SIZE (1 << 20)
-
 static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink)
 static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink)
 {
 {
 	return virt_to_head_page(qlink)->slab_cache;
 	return virt_to_head_page(qlink)->slab_cache;
@@ -191,21 +184,30 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache)
 
 
 	if (unlikely(!qlist_empty(&temp))) {
 	if (unlikely(!qlist_empty(&temp))) {
 		spin_lock_irqsave(&quarantine_lock, flags);
 		spin_lock_irqsave(&quarantine_lock, flags);
-		qlist_move_all(&temp, &global_quarantine);
+		WRITE_ONCE(quarantine_size, quarantine_size + temp.bytes);
+		qlist_move_all(&temp, &global_quarantine[quarantine_tail]);
+		if (global_quarantine[quarantine_tail].bytes >=
+				READ_ONCE(quarantine_batch_size)) {
+			int new_tail;
+
+			new_tail = quarantine_tail + 1;
+			if (new_tail == QUARANTINE_BATCHES)
+				new_tail = 0;
+			if (new_tail != quarantine_head)
+				quarantine_tail = new_tail;
+		}
 		spin_unlock_irqrestore(&quarantine_lock, flags);
 		spin_unlock_irqrestore(&quarantine_lock, flags);
 	}
 	}
 }
 }
 
 
 void quarantine_reduce(void)
 void quarantine_reduce(void)
 {
 {
-	size_t new_quarantine_size, percpu_quarantines;
+	size_t total_size, new_quarantine_size, percpu_quarantines;
 	unsigned long flags;
 	unsigned long flags;
 	struct qlist_head to_free = QLIST_INIT;
 	struct qlist_head to_free = QLIST_INIT;
-	size_t size_to_free = 0;
-	struct qlist_node *last;
 
 
-	if (likely(READ_ONCE(global_quarantine.bytes) <=
-		   READ_ONCE(quarantine_size)))
+	if (likely(READ_ONCE(quarantine_size) <=
+		   READ_ONCE(quarantine_max_size)))
 		return;
 		return;
 
 
 	spin_lock_irqsave(&quarantine_lock, flags);
 	spin_lock_irqsave(&quarantine_lock, flags);
@@ -214,24 +216,23 @@ void quarantine_reduce(void)
 	 * Update quarantine size in case of hotplug. Allocate a fraction of
 	 * Update quarantine size in case of hotplug. Allocate a fraction of
 	 * the installed memory to quarantine minus per-cpu queue limits.
 	 * the installed memory to quarantine minus per-cpu queue limits.
 	 */
 	 */
-	new_quarantine_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) /
+	total_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) /
 		QUARANTINE_FRACTION;
 		QUARANTINE_FRACTION;
 	percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus();
 	percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus();
-	new_quarantine_size = (new_quarantine_size < percpu_quarantines) ?
-		0 : new_quarantine_size - percpu_quarantines;
-	WRITE_ONCE(quarantine_size, new_quarantine_size);
-
-	last = global_quarantine.head;
-	while (last) {
-		struct kmem_cache *cache = qlink_to_cache(last);
-
-		size_to_free += cache->size;
-		if (!last->next || size_to_free >
-		    global_quarantine.bytes - QUARANTINE_LOW_SIZE)
-			break;
-		last = last->next;
+	new_quarantine_size = (total_size < percpu_quarantines) ?
+		0 : total_size - percpu_quarantines;
+	WRITE_ONCE(quarantine_max_size, new_quarantine_size);
+	/* Aim at consuming at most 1/2 of slots in quarantine. */
+	WRITE_ONCE(quarantine_batch_size, max((size_t)QUARANTINE_PERCPU_SIZE,
+		2 * total_size / QUARANTINE_BATCHES));
+
+	if (likely(quarantine_size > quarantine_max_size)) {
+		qlist_move_all(&global_quarantine[quarantine_head], &to_free);
+		WRITE_ONCE(quarantine_size, quarantine_size - to_free.bytes);
+		quarantine_head++;
+		if (quarantine_head == QUARANTINE_BATCHES)
+			quarantine_head = 0;
 	}
 	}
-	qlist_move(&global_quarantine, last, &to_free, size_to_free);
 
 
 	spin_unlock_irqrestore(&quarantine_lock, flags);
 	spin_unlock_irqrestore(&quarantine_lock, flags);
 
 
@@ -275,13 +276,14 @@ static void per_cpu_remove_cache(void *arg)
 
 
 void quarantine_remove_cache(struct kmem_cache *cache)
 void quarantine_remove_cache(struct kmem_cache *cache)
 {
 {
-	unsigned long flags;
+	unsigned long flags, i;
 	struct qlist_head to_free = QLIST_INIT;
 	struct qlist_head to_free = QLIST_INIT;
 
 
 	on_each_cpu(per_cpu_remove_cache, cache, 1);
 	on_each_cpu(per_cpu_remove_cache, cache, 1);
 
 
 	spin_lock_irqsave(&quarantine_lock, flags);
 	spin_lock_irqsave(&quarantine_lock, flags);
-	qlist_move_cache(&global_quarantine, &to_free, cache);
+	for (i = 0; i < QUARANTINE_BATCHES; i++)
+		qlist_move_cache(&global_quarantine[i], &to_free, cache);
 	spin_unlock_irqrestore(&quarantine_lock, flags);
 	spin_unlock_irqrestore(&quarantine_lock, flags);
 
 
 	qlist_free_all(&to_free, cache);
 	qlist_free_all(&to_free, cache);
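The global KASAN quarantine becomes a ring of batches (at least 1024, or 4 per possible CPU): frees fill the tail batch, the tail advances once a batch reaches its target size, and when the total exceeds the limit a whole head batch is freed, so reduction no longer walks one long list under the spinlock. A standalone toy model of that ring; the sizes are illustrative and, unlike the kernel (which drops one batch per reduce), the demo loops until it is back under the limit:

/* Toy model of the batched FIFO quarantine. */
#include <stdio.h>

#define BATCHES		8
#define BATCH_SIZE	4096	/* bytes per batch before advancing tail */
#define MAX_TOTAL	16384	/* global limit before dropping a batch  */

static unsigned long batch_bytes[BATCHES];
static int head, tail;
static unsigned long total;

static void quarantine_put(unsigned long bytes)
{
	batch_bytes[tail] += bytes;
	total += bytes;

	/* tail batch full: move on, unless we would collide with head */
	if (batch_bytes[tail] >= BATCH_SIZE) {
		int new_tail = (tail + 1) % BATCHES;

		if (new_tail != head)
			tail = new_tail;
	}
}

static void quarantine_reduce(void)
{
	while (total > MAX_TOTAL && head != tail) {
		printf("free batch %d (%lu bytes)\n", head, batch_bytes[head]);
		total -= batch_bytes[head];
		batch_bytes[head] = 0;
		head = (head + 1) % BATCHES;
	}
}

int main(void)
{
	for (int i = 0; i < 40; i++)
		quarantine_put(997);	/* pretend frees of ~1KB objects */
	quarantine_reduce();
	printf("remaining: %lu bytes in batches %d..%d\n", total, head, tail);
	return 0;
}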

+ 2 - 0
mm/kasan/report.c

@@ -136,6 +136,8 @@ static void kasan_end_report(unsigned long *flags)
 	pr_err("==================================================================\n");
 	pr_err("==================================================================\n");
 	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
 	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
 	spin_unlock_irqrestore(&report_lock, *flags);
 	spin_unlock_irqrestore(&report_lock, *flags);
+	if (panic_on_warn)
+		panic("panic_on_warn set ...\n");
 	kasan_enable_current();
 	kasan_enable_current();
 }
 }
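kasan_end_report() now honours panic_on_warn, which is useful when fuzzing: the first report halts the machine instead of letting it run on in a corrupted state. The knob is the existing sysctl; flipping it at run time (equivalent to booting with panic_on_warn=1) looks like:

/* Turn on panic_on_warn at run time. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/panic_on_warn", "w");

	if (!f) {
		perror("panic_on_warn");
		return 1;
	}
	fputs("1\n", f);
	fclose(f);
	return 0;
}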
 
 

+ 31 - 6
mm/khugepaged.c

@@ -1242,6 +1242,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 	struct vm_area_struct *vma;
 	struct vm_area_struct *vma;
 	unsigned long addr;
 	unsigned long addr;
 	pmd_t *pmd, _pmd;
 	pmd_t *pmd, _pmd;
+	bool deposited = false;
 
 
 	i_mmap_lock_write(mapping);
 	i_mmap_lock_write(mapping);
 	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
@@ -1266,10 +1267,26 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 			spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
 			spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
 			/* assume page table is clear */
 			/* assume page table is clear */
 			_pmd = pmdp_collapse_flush(vma, addr, pmd);
 			_pmd = pmdp_collapse_flush(vma, addr, pmd);
+			/*
+			 * now deposit the pgtable for archs that need it;
+			 * otherwise free it.
+			 */
+			if (arch_needs_pgtable_deposit()) {
+				/*
+				 * The deposit should be visible only after
+				 * collapse is seen by others.
+				 */
+				smp_wmb();
+				pgtable_trans_huge_deposit(vma->vm_mm, pmd,
+							   pmd_pgtable(_pmd));
+				deposited = true;
+			}
 			spin_unlock(ptl);
 			spin_unlock(ptl);
 			up_write(&vma->vm_mm->mmap_sem);
 			up_write(&vma->vm_mm->mmap_sem);
-			atomic_long_dec(&vma->vm_mm->nr_ptes);
-			pte_free(vma->vm_mm, pmd_pgtable(_pmd));
+			if (!deposited) {
+				atomic_long_dec(&vma->vm_mm->nr_ptes);
+				pte_free(vma->vm_mm, pmd_pgtable(_pmd));
+			}
 		}
 		}
 	}
 	}
 	i_mmap_unlock_write(mapping);
 	i_mmap_unlock_write(mapping);
@@ -1403,6 +1420,9 @@ static void collapse_shmem(struct mm_struct *mm,
 
 
 		spin_lock_irq(&mapping->tree_lock);
 		spin_lock_irq(&mapping->tree_lock);
 
 
+		slot = radix_tree_lookup_slot(&mapping->page_tree, index);
+		VM_BUG_ON_PAGE(page != radix_tree_deref_slot_protected(slot,
+					&mapping->tree_lock), page);
 		VM_BUG_ON_PAGE(page_mapped(page), page);
 		VM_BUG_ON_PAGE(page_mapped(page), page);
 
 
 		/*
 		/*
@@ -1423,9 +1443,10 @@ static void collapse_shmem(struct mm_struct *mm,
 		list_add_tail(&page->lru, &pagelist);
 		list_add_tail(&page->lru, &pagelist);
 
 
 		/* Finally, replace with the new page. */
 		/* Finally, replace with the new page. */
-		radix_tree_replace_slot(slot,
+		radix_tree_replace_slot(&mapping->page_tree, slot,
 				new_page + (index % HPAGE_PMD_NR));
 				new_page + (index % HPAGE_PMD_NR));
 
 
+		slot = radix_tree_iter_next(&iter);
 		index++;
 		index++;
 		continue;
 		continue;
 out_lru:
 out_lru:
@@ -1521,9 +1542,11 @@ tree_unlocked:
 			if (!page || iter.index < page->index) {
 			if (!page || iter.index < page->index) {
 				if (!nr_none)
 				if (!nr_none)
 					break;
 					break;
-				/* Put holes back where they were */
-				radix_tree_replace_slot(slot, NULL);
 				nr_none--;
 				nr_none--;
+				/* Put holes back where they were */
+				radix_tree_delete(&mapping->page_tree,
+						  iter.index);
+				slot = radix_tree_iter_next(&iter);
 				continue;
 				continue;
 			}
 			}
 
 
@@ -1532,11 +1555,13 @@ tree_unlocked:
 			/* Unfreeze the page. */
 			/* Unfreeze the page. */
 			list_del(&page->lru);
 			list_del(&page->lru);
 			page_ref_unfreeze(page, 2);
 			page_ref_unfreeze(page, 2);
-			radix_tree_replace_slot(slot, page);
+			radix_tree_replace_slot(&mapping->page_tree,
+						slot, page);
 			spin_unlock_irq(&mapping->tree_lock);
 			spin_unlock_irq(&mapping->tree_lock);
 			putback_lru_page(page);
 			putback_lru_page(page);
 			unlock_page(page);
 			unlock_page(page);
 			spin_lock_irq(&mapping->tree_lock);
 			spin_lock_irq(&mapping->tree_lock);
+			slot = radix_tree_iter_next(&iter);
 		}
 		}
 		VM_BUG_ON(nr_none);
 		VM_BUG_ON(nr_none);
 		spin_unlock_irq(&mapping->tree_lock);
 		spin_unlock_irq(&mapping->tree_lock);

+ 1 - 1
mm/kmemleak.c

@@ -19,7 +19,7 @@
  *
  *
  *
  *
  * For more information on the algorithm and kmemleak usage, please see
  * For more information on the algorithm and kmemleak usage, please see
- * Documentation/kmemleak.txt.
+ * Documentation/dev-tools/kmemleak.rst.
  *
  *
  * Notes on locking
  * Notes on locking
  * ----------------
  * ----------------

+ 1 - 0
mm/madvise.c

@@ -281,6 +281,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 	if (pmd_trans_unstable(pmd))
 	if (pmd_trans_unstable(pmd))
 		return 0;
 		return 0;
 
 
+	tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
 	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	arch_enter_lazy_mmu_mode();
 	arch_enter_lazy_mmu_mode();
 	for (; addr != end; pte++, addr += PAGE_SIZE) {
 	for (; addr != end; pte++, addr += PAGE_SIZE) {
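madvise_free_pte_range() now tells the mmu_gather that it operates on PAGE_SIZE entries before walking the ptes. That path is reached from userspace through madvise(MADV_FREE); a minimal exerciser, assuming a v4.5+ kernel and private anonymous memory:

/* Exercise the MADV_FREE path: map anonymous memory, dirty it, then
 * mark it freeable.  The kernel may reclaim it lazily under pressure. */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_FREE
#define MADV_FREE 8		/* from uapi/asm-generic/mman-common.h */
#endif

int main(void)
{
	size_t len = 16 * 4096;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(p, 0xaa, len);			/* fault the pages in */
	if (madvise(p, len, MADV_FREE))		/* EINVAL on pre-4.5 kernels */
		perror("madvise(MADV_FREE)");
	munmap(p, len);
	return 0;
}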

+ 14 - 1
mm/memcontrol.c

@@ -2145,6 +2145,8 @@ struct memcg_kmem_cache_create_work {
 	struct work_struct work;
 	struct work_struct work;
 };
 };
 
 
+static struct workqueue_struct *memcg_kmem_cache_create_wq;
+
 static void memcg_kmem_cache_create_func(struct work_struct *w)
 static void memcg_kmem_cache_create_func(struct work_struct *w)
 {
 {
 	struct memcg_kmem_cache_create_work *cw =
 	struct memcg_kmem_cache_create_work *cw =
@@ -2176,7 +2178,7 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
 	cw->cachep = cachep;
 	cw->cachep = cachep;
 	INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
 	INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
 
 
-	schedule_work(&cw->work);
+	queue_work(memcg_kmem_cache_create_wq, &cw->work);
 }
 }
 
 
 static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
 static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
@@ -5774,6 +5776,17 @@ static int __init mem_cgroup_init(void)
 {
 {
 	int cpu, node;
 	int cpu, node;
 
 
+#ifndef CONFIG_SLOB
+	/*
+	 * Kmem cache creation is mostly done with the slab_mutex held,
+	 * so use a special workqueue to avoid stalling all worker
+	 * threads in case lots of cgroups are created simultaneously.
+	 */
+	memcg_kmem_cache_create_wq =
+		alloc_ordered_workqueue("memcg_kmem_cache_create", 0);
+	BUG_ON(!memcg_kmem_cache_create_wq);
+#endif
+
 	cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
 	cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
 				  memcg_hotplug_cpu_dead);
 				  memcg_hotplug_cpu_dead);
 
 

+ 64 - 28
mm/memory.c

@@ -300,15 +300,14 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
 	struct mmu_gather_batch *batch;
 	struct mmu_gather_batch *batch;
 
 
 	VM_BUG_ON(!tlb->end);
 	VM_BUG_ON(!tlb->end);
-
-	if (!tlb->page_size)
-		tlb->page_size = page_size;
-	else {
-		if (page_size != tlb->page_size)
-			return true;
-	}
+	VM_WARN_ON(tlb->page_size != page_size);
 
 
 	batch = tlb->active;
 	batch = tlb->active;
+	/*
+	 * Add the page and check if we are full. If so
+	 * force a flush.
+	 */
+	batch->pages[batch->nr++] = page;
 	if (batch->nr == batch->max) {
 	if (batch->nr == batch->max) {
 		if (!tlb_next_batch(tlb))
 		if (!tlb_next_batch(tlb))
 			return true;
 			return true;
@@ -316,7 +315,6 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
 	}
 	}
 	VM_BUG_ON_PAGE(batch->nr > batch->max, page);
 	VM_BUG_ON_PAGE(batch->nr > batch->max, page);
 
 
-	batch->pages[batch->nr++] = page;
 	return false;
 	return false;
 }
 }
 
 
@@ -528,7 +526,11 @@ void free_pgd_range(struct mmu_gather *tlb,
 		end -= PMD_SIZE;
 		end -= PMD_SIZE;
 	if (addr > end - 1)
 	if (addr > end - 1)
 		return;
 		return;
-
+	/*
+	 * We add page table cache pages with PAGE_SIZE
+	 * (see pte_free_tlb()); flush the TLB if we need to.
+	 */
+	tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
 	pgd = pgd_offset(tlb->mm, addr);
 	pgd = pgd_offset(tlb->mm, addr);
 	do {
 	do {
 		next = pgd_addr_end(addr, end);
 		next = pgd_addr_end(addr, end);
@@ -1118,8 +1120,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 	pte_t *start_pte;
 	pte_t *start_pte;
 	pte_t *pte;
 	pte_t *pte;
 	swp_entry_t entry;
 	swp_entry_t entry;
-	struct page *pending_page = NULL;
 
 
+	tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
 again:
 again:
 	init_rss_vec(rss);
 	init_rss_vec(rss);
 	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
@@ -1172,7 +1174,6 @@ again:
 				print_bad_pte(vma, addr, ptent, page);
 				print_bad_pte(vma, addr, ptent, page);
 			if (unlikely(__tlb_remove_page(tlb, page))) {
 			if (unlikely(__tlb_remove_page(tlb, page))) {
 				force_flush = 1;
 				force_flush = 1;
-				pending_page = page;
 				addr += PAGE_SIZE;
 				addr += PAGE_SIZE;
 				break;
 				break;
 			}
 			}
@@ -1213,11 +1214,6 @@ again:
 	if (force_flush) {
 	if (force_flush) {
 		force_flush = 0;
 		force_flush = 0;
 		tlb_flush_mmu_free(tlb);
 		tlb_flush_mmu_free(tlb);
-		if (pending_page) {
-			/* remove the page with new size */
-			__tlb_remove_pte_page(tlb, pending_page);
-			pending_page = NULL;
-		}
 		if (addr != end)
 		if (addr != end)
 			goto again;
 			goto again;
 	}
 	}
@@ -1240,7 +1236,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
 			if (next - addr != HPAGE_PMD_SIZE) {
 			if (next - addr != HPAGE_PMD_SIZE) {
 				VM_BUG_ON_VMA(vma_is_anonymous(vma) &&
 				VM_BUG_ON_VMA(vma_is_anonymous(vma) &&
 				    !rwsem_is_locked(&tlb->mm->mmap_sem), vma);
 				    !rwsem_is_locked(&tlb->mm->mmap_sem), vma);
-				split_huge_pmd(vma, pmd, addr);
+				__split_huge_pmd(vma, pmd, addr, false, NULL);
 			} else if (zap_huge_pmd(tlb, vma, pmd, addr))
 			} else if (zap_huge_pmd(tlb, vma, pmd, addr))
 				goto next;
 				goto next;
 			/* fall through */
 			/* fall through */
@@ -2939,6 +2935,19 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
 	return true;
 	return true;
 }
 }
 
 
+static void deposit_prealloc_pte(struct fault_env *fe)
+{
+	struct vm_area_struct *vma = fe->vma;
+
+	pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, fe->prealloc_pte);
+	/*
+	 * We are going to consume the prealloc table,
+	 * count that as nr_ptes.
+	 */
+	atomic_long_inc(&vma->vm_mm->nr_ptes);
+	fe->prealloc_pte = 0;
+}
+
 static int do_set_pmd(struct fault_env *fe, struct page *page)
 static int do_set_pmd(struct fault_env *fe, struct page *page)
 {
 {
 	struct vm_area_struct *vma = fe->vma;
 	struct vm_area_struct *vma = fe->vma;
@@ -2953,6 +2962,17 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
 	ret = VM_FAULT_FALLBACK;
 	ret = VM_FAULT_FALLBACK;
 	page = compound_head(page);
 	page = compound_head(page);
 
 
+	/*
+	 * Archs like ppc64 need additional space to store information
+	 * related to pte entry. Use the preallocated table for that.
+	 */
+	if (arch_needs_pgtable_deposit() && !fe->prealloc_pte) {
+		fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address);
+		if (!fe->prealloc_pte)
+			return VM_FAULT_OOM;
+		smp_wmb(); /* See comment in __pte_alloc() */
+	}
+
 	fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
 	fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
 	if (unlikely(!pmd_none(*fe->pmd)))
 	if (unlikely(!pmd_none(*fe->pmd)))
 		goto out;
 		goto out;
@@ -2966,6 +2986,11 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
 
 
 	add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR);
 	add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR);
 	page_add_file_rmap(page, true);
 	page_add_file_rmap(page, true);
+	/*
+	 * deposit and withdraw with pmd lock held
+	 */
+	if (arch_needs_pgtable_deposit())
+		deposit_prealloc_pte(fe);
 
 
 	set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
 	set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
 
 
@@ -2975,6 +3000,13 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
 	ret = 0;
 	ret = 0;
 	count_vm_event(THP_FILE_MAPPED);
 	count_vm_event(THP_FILE_MAPPED);
 out:
 out:
+	/*
+	 * If we are going to fallback to pte mapping, do a
+	 * withdraw with pmd lock held.
+	 */
+	if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK)
+		fe->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm,
+							       fe->pmd);
 	spin_unlock(fe->ptl);
 	spin_unlock(fe->ptl);
 	return ret;
 	return ret;
 }
 }
@@ -3014,18 +3046,20 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
 
 
 		ret = do_set_pmd(fe, page);
 		ret = do_set_pmd(fe, page);
 		if (ret != VM_FAULT_FALLBACK)
 		if (ret != VM_FAULT_FALLBACK)
-			return ret;
+			goto fault_handled;
 	}
 	}
 
 
 	if (!fe->pte) {
 	if (!fe->pte) {
 		ret = pte_alloc_one_map(fe);
 		ret = pte_alloc_one_map(fe);
 		if (ret)
 		if (ret)
-			return ret;
+			goto fault_handled;
 	}
 	}
 
 
 	/* Re-check under ptl */
 	/* Re-check under ptl */
-	if (unlikely(!pte_none(*fe->pte)))
-		return VM_FAULT_NOPAGE;
+	if (unlikely(!pte_none(*fe->pte))) {
+		ret = VM_FAULT_NOPAGE;
+		goto fault_handled;
+	}
 
 
 	flush_icache_page(vma, page);
 	flush_icache_page(vma, page);
 	entry = mk_pte(page, vma->vm_page_prot);
 	entry = mk_pte(page, vma->vm_page_prot);
@@ -3045,8 +3079,15 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
 
 
 	/* no need to invalidate: a not-present page won't be cached */
 	/* no need to invalidate: a not-present page won't be cached */
 	update_mmu_cache(vma, fe->address, fe->pte);
 	update_mmu_cache(vma, fe->address, fe->pte);
+	ret = 0;
 
 
-	return 0;
+fault_handled:
+	/* preallocated pagetable is unused: free it */
+	if (fe->prealloc_pte) {
+		pte_free(fe->vma->vm_mm, fe->prealloc_pte);
+		fe->prealloc_pte = 0;
+	}
+	return ret;
 }
 }
 
 
 static unsigned long fault_around_bytes __read_mostly =
 static unsigned long fault_around_bytes __read_mostly =
@@ -3145,11 +3186,6 @@ static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
 
 
 	fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff);
 	fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff);
 
 
-	/* preallocated pagetable is unused: free it */
-	if (fe->prealloc_pte) {
-		pte_free(fe->vma->vm_mm, fe->prealloc_pte);
-		fe->prealloc_pte = 0;
-	}
 	/* Huge page is mapped? Page fault is solved */
 	/* Huge page is mapped? Page fault is solved */
 	if (pmd_trans_huge(*fe->pmd)) {
 	if (pmd_trans_huge(*fe->pmd)) {
 		ret = VM_FAULT_NOPAGE;
 		ret = VM_FAULT_NOPAGE;
@@ -3454,7 +3490,7 @@ static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd)
 
 
 	/* COW handled on pte level: split pmd */
 	/* COW handled on pte level: split pmd */
 	VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma);
 	VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma);
-	split_huge_pmd(fe->vma, fe->pmd, fe->address);
+	__split_huge_pmd(fe->vma, fe->pmd, fe->address, false, NULL);
 
 
 	return VM_FAULT_FALLBACK;
 	return VM_FAULT_FALLBACK;
 }
 }
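alloc_set_pte() now funnels every exit through the fault_handled label so the preallocated page table is released in exactly one place (the last hunk removes the old free from do_fault_around()). The general shape of that refactor, sketched with invented names and malloc() standing in for pte_alloc_one():

/* Generic shape of the cleanup above: one label releases the optional
 * preallocation on every exit path.  Names are illustrative only. */
#include <stdlib.h>

struct fault_ctx {
	void *prealloc;			/* stands in for fe->prealloc_pte */
};

/* stubbed fast path; a real one might take ownership of c->prealloc
 * and clear the pointer */
static int fast_path(struct fault_ctx *c)
{
	(void)c;
	return -1;			/* "fallback" for this demo */
}

static int slow_path(struct fault_ctx *c)
{
	(void)c;
	return 0;
}

static int handle_fault(struct fault_ctx *c)
{
	int ret;

	c->prealloc = malloc(64);	/* optimistic preallocation */
	if (!c->prealloc)
		return -1;

	ret = fast_path(c);		/* may consume c->prealloc */
	if (ret != -1)
		goto handled;

	ret = slow_path(c);

handled:
	/* preallocation unused?  free it here, and only here */
	if (c->prealloc) {
		free(c->prealloc);
		c->prealloc = NULL;
	}
	return ret;
}

int main(void)
{
	struct fault_ctx c = { 0 };

	return handle_fault(&c);
}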

+ 0 - 20
mm/memory_hotplug.c

@@ -1727,26 +1727,6 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
 static int __init cmdline_parse_movable_node(char *p)
 static int __init cmdline_parse_movable_node(char *p)
 {
 {
 #ifdef CONFIG_MOVABLE_NODE
 #ifdef CONFIG_MOVABLE_NODE
-	/*
-	 * Memory used by the kernel cannot be hot-removed because Linux
-	 * cannot migrate the kernel pages. When memory hotplug is
-	 * enabled, we should prevent memblock from allocating memory
-	 * for the kernel.
-	 *
-	 * ACPI SRAT records all hotpluggable memory ranges. But before
-	 * SRAT is parsed, we don't know about it.
-	 *
-	 * The kernel image is loaded into memory at very early time. We
-	 * cannot prevent this anyway. So on NUMA system, we set any
-	 * node the kernel resides in as un-hotpluggable.
-	 *
-	 * Since on modern servers, one node could have double-digit
-	 * gigabytes memory, we can assume the memory around the kernel
-	 * image is also un-hotpluggable. So before SRAT is parsed, just
-	 * allocate memory near the kernel image to try the best to keep
-	 * the kernel away from hotpluggable memory.
-	 */
-	memblock_set_bottom_up(true);
 	movable_node_enabled = true;
 	movable_node_enabled = true;
 #else
 #else
 	pr_warn("movable_node option not supported\n");
 	pr_warn("movable_node option not supported\n");

+ 12 - 18
mm/mempolicy.c

@@ -276,7 +276,9 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 				return ERR_PTR(-EINVAL);
 				return ERR_PTR(-EINVAL);
 		}
 		}
 	} else if (mode == MPOL_LOCAL) {
 	} else if (mode == MPOL_LOCAL) {
-		if (!nodes_empty(*nodes))
+		if (!nodes_empty(*nodes) ||
+		    (flags & MPOL_F_STATIC_NODES) ||
+		    (flags & MPOL_F_RELATIVE_NODES))
 			return ERR_PTR(-EINVAL);
 			return ERR_PTR(-EINVAL);
 		mode = MPOL_PREFERRED;
 		mode = MPOL_PREFERRED;
 	} else if (nodes_empty(*nodes))
 	} else if (nodes_empty(*nodes))
@@ -496,7 +498,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
 			page = pmd_page(*pmd);
 			page = pmd_page(*pmd);
 			if (is_huge_zero_page(page)) {
 			if (is_huge_zero_page(page)) {
 				spin_unlock(ptl);
 				spin_unlock(ptl);
-				split_huge_pmd(vma, pmd, addr);
+				__split_huge_pmd(vma, pmd, addr, false, NULL);
 			} else {
 			} else {
 				get_page(page);
 				get_page(page);
 				spin_unlock(ptl);
 				spin_unlock(ptl);
@@ -1679,25 +1681,17 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
 	int nd)
 	int nd)
 {
 {
-	switch (policy->mode) {
-	case MPOL_PREFERRED:
-		if (!(policy->flags & MPOL_F_LOCAL))
-			nd = policy->v.preferred_node;
-		break;
-	case MPOL_BIND:
+	if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
+		nd = policy->v.preferred_node;
+	else {
 		/*
 		/*
-		 * Normally, MPOL_BIND allocations are node-local within the
-		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
-		 * current node isn't part of the mask, we use the zonelist for
-		 * the first node in the mask instead.
+		 * __GFP_THISNODE shouldn't even be used with the bind policy
+		 * because we might easily break the expectation to stay on the
+		 * requested node and not break the policy.
 		 */
 		 */
-		if (unlikely(gfp & __GFP_THISNODE) &&
-				unlikely(!node_isset(nd, policy->v.nodes)))
-			nd = first_node(policy->v.nodes);
-		break;
-	default:
-		BUG();
+		WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
 	}
 	}
+
 	return node_zonelist(nd, gfp);
 	return node_zonelist(nd, gfp);
 }
 }
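mpol_new() now rejects MPOL_LOCAL combined with MPOL_F_STATIC_NODES or MPOL_F_RELATIVE_NODES instead of quietly accepting the flags. From userspace the check surfaces as EINVAL from set_mempolicy(); a hedged probe, with the MPOL_* values copied from uapi/linux/mempolicy.h because older libnuma headers may not define MPOL_LOCAL (requires a NUMA-enabled kernel):

/* Expect EINVAL: MPOL_LOCAL with MPOL_F_STATIC_NODES is now rejected. */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

#define MPOL_LOCAL		4		/* uapi/linux/mempolicy.h */
#define MPOL_F_STATIC_NODES	(1 << 15)

int main(void)
{
	long ret = syscall(SYS_set_mempolicy,
			   MPOL_LOCAL | MPOL_F_STATIC_NODES, NULL, 0UL);

	if (ret < 0)
		printf("set_mempolicy: %s (expected EINVAL)\n",
		       strerror(errno));
	else
		printf("unexpectedly succeeded\n");

	/* plain MPOL_LOCAL (no flags) is still accepted */
	if (syscall(SYS_set_mempolicy, MPOL_LOCAL, NULL, 0UL) == 0)
		printf("MPOL_LOCAL without flags: ok\n");
	return 0;
}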
 
 

+ 13 - 6
mm/migrate.c

@@ -168,8 +168,6 @@ void putback_movable_pages(struct list_head *l)
 			continue;
 			continue;
 		}
 		}
 		list_del(&page->lru);
 		list_del(&page->lru);
-		dec_node_page_state(page, NR_ISOLATED_ANON +
-				page_is_file_cache(page));
 		/*
 		/*
 		 * We isolated non-lru movable page so here we can use
 		 * We isolated non-lru movable page so here we can use
 		 * __PageMovable because LRU page's mapping cannot have
 		 * __PageMovable because LRU page's mapping cannot have
@@ -186,6 +184,8 @@ void putback_movable_pages(struct list_head *l)
 			put_page(page);
 			put_page(page);
 		} else {
 		} else {
 			putback_lru_page(page);
 			putback_lru_page(page);
+			dec_node_page_state(page, NR_ISOLATED_ANON +
+					page_is_file_cache(page));
 		}
 		}
 	}
 	}
 }
 }
@@ -482,7 +482,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
 		SetPageDirty(newpage);
 		SetPageDirty(newpage);
 	}
 	}
 
 
-	radix_tree_replace_slot(pslot, newpage);
+	radix_tree_replace_slot(&mapping->page_tree, pslot, newpage);
 
 
 	/*
 	/*
 	 * Drop cache reference from old page by unfreezing
 	 * Drop cache reference from old page by unfreezing
@@ -556,7 +556,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
 
 
 	get_page(newpage);
 	get_page(newpage);
 
 
-	radix_tree_replace_slot(pslot, newpage);
+	radix_tree_replace_slot(&mapping->page_tree, pslot, newpage);
 
 
 	page_ref_unfreeze(page, expected_count - 1);
 	page_ref_unfreeze(page, expected_count - 1);
 
 
@@ -1121,8 +1121,15 @@ out:
 		 * restored.
 		 * restored.
 		 */
 		 */
 		list_del(&page->lru);
 		list_del(&page->lru);
-		dec_node_page_state(page, NR_ISOLATED_ANON +
-				page_is_file_cache(page));
+
+		/*
+		 * Compaction can also migrate non-LRU pages, which are
+		 * not accounted to NR_ISOLATED_*. They can be recognized
+		 * as __PageMovable.
+		 */
+		if (likely(!__PageMovable(page)))
+			dec_node_page_state(page, NR_ISOLATED_ANON +
+					page_is_file_cache(page));
 	}
 	}
 
 
 	/*
 	/*

+ 18 - 1
mm/mprotect.c

@@ -69,11 +69,17 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 	pte_t *pte, oldpte;
 	pte_t *pte, oldpte;
 	spinlock_t *ptl;
 	spinlock_t *ptl;
 	unsigned long pages = 0;
 	unsigned long pages = 0;
+	int target_node = NUMA_NO_NODE;
 
 
 	pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl);
 	pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl);
 	if (!pte)
 	if (!pte)
 		return 0;
 		return 0;
 
 
+	/* Get target node for single threaded private VMAs */
+	if (prot_numa && !(vma->vm_flags & VM_SHARED) &&
+	    atomic_read(&vma->vm_mm->mm_users) == 1)
+		target_node = numa_node_id();
+
 	arch_enter_lazy_mmu_mode();
 	arch_enter_lazy_mmu_mode();
 	do {
 	do {
 		oldpte = *pte;
 		oldpte = *pte;
@@ -95,6 +101,13 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 				/* Avoid TLB flush if possible */
 				/* Avoid TLB flush if possible */
 				if (pte_protnone(oldpte))
 				if (pte_protnone(oldpte))
 					continue;
 					continue;
+
+				/*
+				 * Don't mess with PTEs if page is already on the node
+				 * a single-threaded process is running on.
+				 */
+				if (target_node == page_to_nid(page))
+					continue;
 			}
 			}
 
 
 			ptent = ptep_modify_prot_start(mm, addr, pte);
 			ptent = ptep_modify_prot_start(mm, addr, pte);
@@ -163,7 +176,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 
 
 		if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
 		if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
 			if (next - addr != HPAGE_PMD_SIZE) {
 			if (next - addr != HPAGE_PMD_SIZE) {
-				split_huge_pmd(vma, pmd, addr);
+				__split_huge_pmd(vma, pmd, addr, false, NULL);
 				if (pmd_trans_unstable(pmd))
 				if (pmd_trans_unstable(pmd))
 					continue;
 					continue;
 			} else {
 			} else {
@@ -484,6 +497,8 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
 	return do_mprotect_pkey(start, len, prot, -1);
 	return do_mprotect_pkey(start, len, prot, -1);
 }
 }
 
 
+#ifdef CONFIG_ARCH_HAS_PKEYS
+
 SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len,
 SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len,
 		unsigned long, prot, int, pkey)
 		unsigned long, prot, int, pkey)
 {
 {
@@ -534,3 +549,5 @@ SYSCALL_DEFINE1(pkey_free, int, pkey)
 	 */
 	 */
 	return ret;
 	return ret;
 }
 }
+
+#endif /* CONFIG_ARCH_HAS_PKEYS */
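The pkey_mprotect/pkey_alloc/pkey_free definitions are now compiled only when the architecture selects CONFIG_ARCH_HAS_PKEYS; elsewhere the generic not-implemented stubs answer the syscalls. A hedged sketch of the call sequence they implement, assuming kernel headers that define the SYS_pkey_* numbers and hardware with protection keys (on anything else the calls simply fail):

/* Allocate a protection key and attach it to a mapping. */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	size_t len = 4096;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	int pkey;

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	pkey = syscall(SYS_pkey_alloc, 0UL, 0UL);	/* flags, init rights */
	if (pkey < 0) {
		perror("pkey_alloc");		/* no pkeys on this system */
		return 1;
	}
	if (syscall(SYS_pkey_mprotect, p, len, PROT_READ, pkey))
		perror("pkey_mprotect");
	syscall(SYS_pkey_free, pkey);
	munmap(p, len);
	return 0;
}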

+ 56 - 19
mm/page_alloc.c

@@ -2058,8 +2058,12 @@ out_unlock:
  * potentially hurts the reliability of high-order allocations when under
  * potentially hurts the reliability of high-order allocations when under
  * intense memory pressure but failed atomic allocations should be easier
  * intense memory pressure but failed atomic allocations should be easier
  * to recover from than an OOM.
  * to recover from than an OOM.
+ *
+ * If @force is true, try to unreserve a pageblock even though highatomic
+ * pageblock is exhausted.
  */
  */
-static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
+static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
+						bool force)
 {
 {
 	struct zonelist *zonelist = ac->zonelist;
 	struct zonelist *zonelist = ac->zonelist;
 	unsigned long flags;
 	unsigned long flags;
@@ -2067,11 +2071,16 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
 	struct zone *zone;
 	struct zone *zone;
 	struct page *page;
 	struct page *page;
 	int order;
 	int order;
+	bool ret;
 
 
 	for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
 	for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
 								ac->nodemask) {
 								ac->nodemask) {
-		/* Preserve at least one pageblock */
-		if (zone->nr_reserved_highatomic <= pageblock_nr_pages)
+		/*
+		 * Preserve at least one pageblock unless memory pressure
+		 * is really high.
+		 */
+		if (!force && zone->nr_reserved_highatomic <=
+					pageblock_nr_pages)
 			continue;
 			continue;
 
 
 		spin_lock_irqsave(&zone->lock, flags);
 		spin_lock_irqsave(&zone->lock, flags);
@@ -2085,13 +2094,25 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
 				continue;
 				continue;
 
 
 			/*
 			/*
-			 * It should never happen but changes to locking could
-			 * inadvertently allow a per-cpu drain to add pages
-			 * to MIGRATE_HIGHATOMIC while unreserving so be safe
-			 * and watch for underflows.
+			 * In the page freeing path the migratetype change is racy,
+			 * so we can encounter several free pages in a pageblock
+			 * in this loop although we changed the pageblock type
+			 * from highatomic to ac->migratetype. So we should
+			 * adjust the count once.
 			 */
 			 */
-			zone->nr_reserved_highatomic -= min(pageblock_nr_pages,
-				zone->nr_reserved_highatomic);
+			if (get_pageblock_migratetype(page) ==
+							MIGRATE_HIGHATOMIC) {
+				/*
+				 * It should never happen but changes to
+				 * locking could inadvertently allow a per-cpu
+				 * drain to add pages to MIGRATE_HIGHATOMIC
+				 * while unreserving so be safe and watch for
+				 * underflows.
+				 */
+				zone->nr_reserved_highatomic -= min(
+						pageblock_nr_pages,
+						zone->nr_reserved_highatomic);
+			}
 
 
 			/*
 			/*
 			 * Convert to ac->migratetype and avoid the normal
 			 * Convert to ac->migratetype and avoid the normal
@@ -2103,12 +2124,16 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
 			 * may increase.
 			 * may increase.
 			 */
 			 */
 			set_pageblock_migratetype(page, ac->migratetype);
 			set_pageblock_migratetype(page, ac->migratetype);
-			move_freepages_block(zone, page, ac->migratetype);
-			spin_unlock_irqrestore(&zone->lock, flags);
-			return;
+			ret = move_freepages_block(zone, page, ac->migratetype);
+			if (ret) {
+				spin_unlock_irqrestore(&zone->lock, flags);
+				return ret;
+			}
 		}
 		}
 		spin_unlock_irqrestore(&zone->lock, flags);
 		spin_unlock_irqrestore(&zone->lock, flags);
 	}
 	}
+
+	return false;
 }
 }
 
 
 /* Remove an element from the buddy allocator from the fallback list */
 /* Remove an element from the buddy allocator from the fallback list */
@@ -2133,7 +2158,8 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
 
 
 		page = list_first_entry(&area->free_list[fallback_mt],
 		page = list_first_entry(&area->free_list[fallback_mt],
 						struct page, lru);
 						struct page, lru);
-		if (can_steal)
+		if (can_steal &&
+			get_pageblock_migratetype(page) != MIGRATE_HIGHATOMIC)
 			steal_suitable_fallback(zone, page, start_migratetype);
 			steal_suitable_fallback(zone, page, start_migratetype);
 
 
 		/* Remove the page from the freelists */
 		/* Remove the page from the freelists */
@@ -2192,7 +2218,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 			unsigned long count, struct list_head *list,
 			unsigned long count, struct list_head *list,
 			int migratetype, bool cold)
 			int migratetype, bool cold)
 {
 {
-	int i;
+	int i, alloced = 0;
 
 
 	spin_lock(&zone->lock);
 	spin_lock(&zone->lock);
 	for (i = 0; i < count; ++i) {
 	for (i = 0; i < count; ++i) {
@@ -2217,13 +2243,21 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 		else
 		else
 			list_add_tail(&page->lru, list);
 			list_add_tail(&page->lru, list);
 		list = &page->lru;
 		list = &page->lru;
+		alloced++;
 		if (is_migrate_cma(get_pcppage_migratetype(page)))
 		if (is_migrate_cma(get_pcppage_migratetype(page)))
 			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
 			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
 					      -(1 << order));
 					      -(1 << order));
 	}
 	}
+
+	/*
+	 * i pages were removed from the buddy list even if some leak due
+	 * to check_pcp_refill failing so adjust NR_FREE_PAGES based
+	 * on i. Do not confuse with 'alloced' which is the number of
+	 * pages added to the pcp list.
+	 */
 	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
 	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
 	spin_unlock(&zone->lock);
 	spin_unlock(&zone->lock);
-	return i;
+	return alloced;
 }
 }
 
 
 #ifdef CONFIG_NUMA
 #ifdef CONFIG_NUMA
@@ -2534,7 +2568,8 @@ int __isolate_free_page(struct page *page, unsigned int order)
 		struct page *endpage = page + (1 << order) - 1;
 		struct page *endpage = page + (1 << order) - 1;
 		for (; page < endpage; page += pageblock_nr_pages) {
 		for (; page < endpage; page += pageblock_nr_pages) {
 			int mt = get_pageblock_migratetype(page);
 			int mt = get_pageblock_migratetype(page);
-			if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
+			if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
+				&& mt != MIGRATE_HIGHATOMIC)
 				set_pageblock_migratetype(page,
 				set_pageblock_migratetype(page,
 							  MIGRATE_MOVABLE);
 							  MIGRATE_MOVABLE);
 		}
 		}
@@ -3305,7 +3340,7 @@ retry:
 	 * Shrink them them and try again
 	 * Shrink them them and try again
 	 */
 	 */
 	if (!page && !drained) {
 	if (!page && !drained) {
-		unreserve_highatomic_pageblock(ac);
+		unreserve_highatomic_pageblock(ac, false);
 		drain_all_pages(NULL);
 		drain_all_pages(NULL);
 		drained = true;
 		drained = true;
 		goto retry;
 		goto retry;
@@ -3422,8 +3457,10 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
 	 * Make sure we converge to OOM if we cannot make any progress
 	 * Make sure we converge to OOM if we cannot make any progress
 	 * several times in the row.
 	 * several times in the row.
 	 */
 	 */
-	if (*no_progress_loops > MAX_RECLAIM_RETRIES)
-		return false;
+	if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
+		/* Before OOM, exhaust highatomic_reserve */
+		return unreserve_highatomic_pageblock(ac, true);
+	}
 
 
 	/*
 	/*
 	 * Keep reclaiming pages while there is a chance this will lead
 	 * Keep reclaiming pages while there is a chance this will lead
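should_reclaim_retry() now makes one final unreserve_highatomic_pageblock(ac, true) call before declaring OOM, and unreserve itself reports whether it actually released anything. The control flow amounts to a retry loop that only gives up once the emergency reserve is confirmed empty; a toy model of that shape, with made-up names and numbers rather than the allocator's real logic:

/* Keep retrying while progress is made; before giving up, force-drain
 * an emergency reserve once.  All names and numbers are illustrative. */
#include <stdbool.h>
#include <stdio.h>

#define MAX_RETRIES 16

static unsigned long reserve_pages = 512;	/* "highatomic" reserve */

/* returns true if it released anything back to the common pool */
static bool unreserve_emergency(bool force)
{
	unsigned long keep = force ? 0 : 128;	/* keep one block normally */

	if (reserve_pages <= keep)
		return false;
	reserve_pages = keep;
	return true;
}

static bool try_alloc(void)	{ return false; }	/* always fails, for the demo */
static bool reclaim_some(void)	{ return false; }

int main(void)
{
	int no_progress = 0;

	while (!try_alloc()) {
		if (reclaim_some()) {
			no_progress = 0;
			continue;
		}
		if (++no_progress > MAX_RETRIES) {
			/* last resort: drain the reserve completely once */
			if (unreserve_emergency(true))
				continue;
			puts("giving up (OOM)");
			return 1;
		}
		unreserve_emergency(false);	/* partial drain, then retry */
	}
	return 0;
}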

+ 12 - 4
mm/percpu.c

@@ -2093,6 +2093,8 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
 	size_t pages_size;
 	struct page **pages;
 	int unit, i, j, rc;
+	int upa;
+	int nr_g0_units;
 
 	snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
 
@@ -2100,7 +2102,12 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
 	if (IS_ERR(ai))
 		return PTR_ERR(ai);
 	BUG_ON(ai->nr_groups != 1);
-	BUG_ON(ai->groups[0].nr_units != num_possible_cpus());
+	upa = ai->alloc_size/ai->unit_size;
+	nr_g0_units = roundup(num_possible_cpus(), upa);
+	if (unlikely(WARN_ON(ai->groups[0].nr_units != nr_g0_units))) {
+		pcpu_free_alloc_info(ai);
+		return -EINVAL;
+	}
 
 	unit_pages = ai->unit_size >> PAGE_SHIFT;
 
@@ -2111,21 +2118,22 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
 
 	/* allocate pages */
 	j = 0;
-	for (unit = 0; unit < num_possible_cpus(); unit++)
+	for (unit = 0; unit < num_possible_cpus(); unit++) {
+		unsigned int cpu = ai->groups[0].cpu_map[unit];
 		for (i = 0; i < unit_pages; i++) {
-			unsigned int cpu = ai->groups[0].cpu_map[unit];
 			void *ptr;
 
 			ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
 			if (!ptr) {
 				pr_warn("failed to allocate %s page for cpu%u\n",
-					psize_str, cpu);
+						psize_str, cpu);
 				goto enomem;
 			}
 			/* kmemleak tracks the percpu allocations separately */
 			kmemleak_free(ptr);
 			pages[j++] = virt_to_page(ptr);
 		}
+	}
 
 	/* allocate vm area, map the pages and copy static data */
 	vm.flags = VM_ALLOC;
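
The percpu.c hunk above replaces a BUG_ON with a recoverable warning because group 0 can legitimately hold more units than possible CPUs: the unit count is padded up to a multiple of the units that fit in one allocation. A minimal user-space sketch of that arithmetic, with invented sizes:

/*
 * Standalone sketch of the unit-count check: group 0 holds
 * num_possible_cpus() rounded up to the units that fit in one
 * allocation, so a straight comparison against the CPU count was
 * too strict.  The sizes below are invented.
 */
#include <stdio.h>

#define ROUNDUP(x, y)	((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
	unsigned long alloc_size = 4UL << 20;	/* hypothetical 4 MiB allocation */
	unsigned long unit_size = 1UL << 20;	/* hypothetical 1 MiB per-CPU unit */
	unsigned long nr_cpus = 6;		/* num_possible_cpus() stand-in */

	unsigned long upa = alloc_size / unit_size;		/* units per alloc */
	unsigned long nr_g0_units = ROUNDUP(nr_cpus, upa);	/* padded count */

	/* 6 CPUs with 4 units per allocation -> 8 units in group 0 */
	printf("upa=%lu nr_g0_units=%lu\n", upa, nr_g0_units);
	return 0;
}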

+ 28 - 11
mm/readahead.c

@@ -207,12 +207,21 @@ out:
  * memory at once.
  */
 int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
-		pgoff_t offset, unsigned long nr_to_read)
+			       pgoff_t offset, unsigned long nr_to_read)
 {
+	struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+	struct file_ra_state *ra = &filp->f_ra;
+	unsigned long max_pages;
+
 	if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
 		return -EINVAL;
 
-	nr_to_read = min(nr_to_read, inode_to_bdi(mapping->host)->ra_pages);
+	/*
+	 * If the request exceeds the readahead window, allow the read to
+	 * be up to the optimal hardware IO size
+	 */
+	max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
+	nr_to_read = min(nr_to_read, max_pages);
 	while (nr_to_read) {
 		int err;
 
@@ -369,9 +378,17 @@ ondemand_readahead(struct address_space *mapping,
 		   bool hit_readahead_marker, pgoff_t offset,
 		   unsigned long req_size)
 {
-	unsigned long max = ra->ra_pages;
+	struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+	unsigned long max_pages = ra->ra_pages;
 	pgoff_t prev_offset;
 
+	/*
+	 * If the request exceeds the readahead window, allow the read to
+	 * be up to the optimal hardware IO size
+	 */
+	if (req_size > max_pages && bdi->io_pages > max_pages)
+		max_pages = min(req_size, bdi->io_pages);
+
 	/*
 	 * start of file
 	 */
@@ -385,7 +402,7 @@ ondemand_readahead(struct address_space *mapping,
 	if ((offset == (ra->start + ra->size - ra->async_size) ||
 	     offset == (ra->start + ra->size))) {
 		ra->start += ra->size;
-		ra->size = get_next_ra_size(ra, max);
+		ra->size = get_next_ra_size(ra, max_pages);
 		ra->async_size = ra->size;
 		goto readit;
 	}
@@ -400,16 +417,16 @@ ondemand_readahead(struct address_space *mapping,
 		pgoff_t start;
 
 		rcu_read_lock();
-		start = page_cache_next_hole(mapping, offset + 1, max);
+		start = page_cache_next_hole(mapping, offset + 1, max_pages);
 		rcu_read_unlock();
 
-		if (!start || start - offset > max)
+		if (!start || start - offset > max_pages)
 			return 0;
 
 		ra->start = start;
 		ra->size = start - offset;	/* old async_size */
 		ra->size += req_size;
-		ra->size = get_next_ra_size(ra, max);
+		ra->size = get_next_ra_size(ra, max_pages);
 		ra->async_size = ra->size;
 		goto readit;
 	}
@@ -417,7 +434,7 @@ ondemand_readahead(struct address_space *mapping,
 	/*
 	 * oversize read
 	 */
-	if (req_size > max)
+	if (req_size > max_pages)
 		goto initial_readahead;
 
 	/*
@@ -433,7 +450,7 @@ ondemand_readahead(struct address_space *mapping,
 	 * Query the page cache and look for the traces(cached history pages)
 	 * that a sequential stream would leave behind.
 	 */
-	if (try_context_readahead(mapping, ra, offset, req_size, max))
+	if (try_context_readahead(mapping, ra, offset, req_size, max_pages))
 		goto readit;
 
 	/*
@@ -444,7 +461,7 @@ ondemand_readahead(struct address_space *mapping,
 
 initial_readahead:
 	ra->start = offset;
-	ra->size = get_init_ra_size(req_size, max);
+	ra->size = get_init_ra_size(req_size, max_pages);
 	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
 
 readit:
@@ -454,7 +471,7 @@ readit:
 	 * the resulted next readahead window into the current one.
 	 */
 	if (offset == ra->start && ra->size == ra->async_size) {
-		ra->async_size = get_next_ra_size(ra, max);
+		ra->async_size = get_next_ra_size(ra, max_pages);
 		ra->size += ra->async_size;
 	}
 
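Both readahead paths above apply the same rule: a request larger than the per-file window may grow the window up to the device's optimal I/O size (bdi->io_pages), but never past it. A standalone sketch of that clamp, using plain integers instead of the kernel's file_ra_state and backing_dev_info fields:

/*
 * Standalone sketch of the clamp the readahead hunks introduce.
 * All types and values here are illustrative, not kernel code.
 */
#include <stdio.h>

static unsigned long ra_window(unsigned long req_size,
			       unsigned long ra_pages,	/* per-file window */
			       unsigned long io_pages)	/* optimal device I/O */
{
	unsigned long max_pages = ra_pages;

	if (req_size > max_pages && io_pages > max_pages)
		max_pages = req_size < io_pages ? req_size : io_pages;

	return max_pages;
}

int main(void)
{
	/* 512-page read, 128-page window, device prefers 256 pages: cap at 256 */
	printf("%lu\n", ra_window(512, 128, 256));
	/* a small read keeps the ordinary 128-page window */
	printf("%lu\n", ra_window(32, 128, 256));
	return 0;
}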

+ 34 - 35
mm/rmap.c

@@ -141,14 +141,15 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
 }
 
 /**
- * anon_vma_prepare - attach an anon_vma to a memory region
+ * __anon_vma_prepare - attach an anon_vma to a memory region
  * @vma: the memory region in question
  *
  * This makes sure the memory mapping described by 'vma' has
  * an 'anon_vma' attached to it, so that we can associate the
  * anonymous pages mapped into it with that anon_vma.
  *
- * The common case will be that we already have one, but if
+ * The common case will be that we already have one, which
+ * is handled inline by anon_vma_prepare(). But if
  * not we either need to find an adjacent mapping that we
  * can re-use the anon_vma from (very common when the only
  * reason for splitting a vma has been mprotect()), or we
@@ -167,48 +168,46 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
  *
  * This must be called with the mmap_sem held for reading.
  */
-int anon_vma_prepare(struct vm_area_struct *vma)
+int __anon_vma_prepare(struct vm_area_struct *vma)
 {
-	struct anon_vma *anon_vma = vma->anon_vma;
+	struct mm_struct *mm = vma->vm_mm;
+	struct anon_vma *anon_vma, *allocated;
 	struct anon_vma_chain *avc;
 
 	might_sleep();
-	if (unlikely(!anon_vma)) {
-		struct mm_struct *mm = vma->vm_mm;
-		struct anon_vma *allocated;
 
-		avc = anon_vma_chain_alloc(GFP_KERNEL);
-		if (!avc)
-			goto out_enomem;
+	avc = anon_vma_chain_alloc(GFP_KERNEL);
+	if (!avc)
+		goto out_enomem;
+
+	anon_vma = find_mergeable_anon_vma(vma);
+	allocated = NULL;
+	if (!anon_vma) {
+		anon_vma = anon_vma_alloc();
+		if (unlikely(!anon_vma))
+			goto out_enomem_free_avc;
+		allocated = anon_vma;
+	}
 
-		anon_vma = find_mergeable_anon_vma(vma);
+	anon_vma_lock_write(anon_vma);
+	/* page_table_lock to protect against threads */
+	spin_lock(&mm->page_table_lock);
+	if (likely(!vma->anon_vma)) {
+		vma->anon_vma = anon_vma;
+		anon_vma_chain_link(vma, avc, anon_vma);
+		/* vma reference or self-parent link for new root */
+		anon_vma->degree++;
 		allocated = NULL;
-		if (!anon_vma) {
-			anon_vma = anon_vma_alloc();
-			if (unlikely(!anon_vma))
-				goto out_enomem_free_avc;
-			allocated = anon_vma;
-		}
+		avc = NULL;
+	}
+	spin_unlock(&mm->page_table_lock);
+	anon_vma_unlock_write(anon_vma);
 
-		anon_vma_lock_write(anon_vma);
-		/* page_table_lock to protect against threads */
-		spin_lock(&mm->page_table_lock);
-		if (likely(!vma->anon_vma)) {
-			vma->anon_vma = anon_vma;
-			anon_vma_chain_link(vma, avc, anon_vma);
-			/* vma reference or self-parent link for new root */
-			anon_vma->degree++;
-			allocated = NULL;
-			avc = NULL;
-		}
-		spin_unlock(&mm->page_table_lock);
-		anon_vma_unlock_write(anon_vma);
+	if (unlikely(allocated))
+		put_anon_vma(allocated);
+	if (unlikely(avc))
+		anon_vma_chain_free(avc);
 
-		if (unlikely(allocated))
-			put_anon_vma(allocated);
-		if (unlikely(avc))
-			anon_vma_chain_free(avc);
-	}
 	return 0;
 
  out_enomem_free_avc:
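
The rename to __anon_vma_prepare() exists so that the common "anon_vma already attached" case can be answered by an inline wrapper without a function call; only the slow path shown above stays out of line. The fragment below is an illustrative sketch of that wrapper's shape (the real one is added to include/linux/rmap.h elsewhere in this series) and assumes the usual kernel context, so it is not a standalone program.

/*
 * Sketch of the fast-path wrapper the updated comment refers to;
 * assumes kernel context (struct vm_area_struct, likely()).
 */
static inline int anon_vma_prepare(struct vm_area_struct *vma)
{
	if (likely(vma->anon_vma))
		return 0;			/* common case: already attached */

	return __anon_vma_prepare(vma);		/* slow path shown in the diff */
}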

+ 8 - 7
mm/shmem.c

@@ -300,18 +300,19 @@ void shmem_uncharge(struct inode *inode, long pages)
 static int shmem_radix_tree_replace(struct address_space *mapping,
 			pgoff_t index, void *expected, void *replacement)
 {
+	struct radix_tree_node *node;
 	void **pslot;
 	void *item;
 
 	VM_BUG_ON(!expected);
 	VM_BUG_ON(!replacement);
-	pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
-	if (!pslot)
+	item = __radix_tree_lookup(&mapping->page_tree, index, &node, &pslot);
+	if (!item)
 		return -ENOENT;
-	item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock);
 	if (item != expected)
 		return -ENOENT;
-	radix_tree_replace_slot(pslot, replacement);
+	__radix_tree_replace(&mapping->page_tree, node, pslot,
+			     replacement, NULL, NULL);
 	return 0;
 }
 
@@ -370,6 +371,7 @@ static bool shmem_confirm_swap(struct address_space *mapping,
 
 int shmem_huge __read_mostly;
 
+#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
 static int shmem_parse_huge(const char *str)
 {
 	if (!strcmp(str, "never"))
@@ -407,6 +409,7 @@ static const char *shmem_format_huge(int huge)
 		return "bad_val";
 	}
 }
+#endif
 
 static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
 		struct shrink_control *sc, unsigned long nr_to_split)
@@ -1539,7 +1542,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 	struct mm_struct *fault_mm, int *fault_type)
 {
 	struct address_space *mapping = inode->i_mapping;
-	struct shmem_inode_info *info;
+	struct shmem_inode_info *info = SHMEM_I(inode);
 	struct shmem_sb_info *sbinfo;
 	struct mm_struct *charge_mm;
 	struct mem_cgroup *memcg;
@@ -1589,7 +1592,6 @@ repeat:
 	 * Fast cache lookup did not find it:
 	 * bring it back from swap or allocate.
 	 */
-	info = SHMEM_I(inode);
 	sbinfo = SHMEM_SB(inode->i_sb);
 	charge_mm = fault_mm ? : current->mm;
 
@@ -1837,7 +1839,6 @@ unlock:
 		put_page(page);
 	}
 	if (error == -ENOSPC && !once++) {
-		info = SHMEM_I(inode);
 		spin_lock_irq(&info->lock);
 		shmem_recalc_inode(inode);
 		spin_unlock_irq(&info->lock);
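
shmem_radix_tree_replace() above now looks the entry up once with __radix_tree_lookup(), verifies it is still the expected page, and writes the replacement through the returned slot, letting __radix_tree_replace() keep the node accounting straight. The toy program below shows only that lookup-verify-replace pattern; a flat array stands in for the radix tree and the caller is assumed to hold the tree lock.

/*
 * Standalone model of the lookup-verify-replace pattern; nothing here
 * is a real kernel API.
 */
#include <stdio.h>

#define NENTRIES 8
static void *tree[NENTRIES];	/* toy "page_tree" */

static int replace_entry(unsigned long index, void *expected, void *replacement)
{
	void **slot;

	if (index >= NENTRIES)
		return -1;		/* -ENOENT in the kernel version */
	slot = &tree[index];
	if (*slot != expected)
		return -1;		/* entry changed underneath us */
	*slot = replacement;		/* __radix_tree_replace() analogue */
	return 0;
}

int main(void)
{
	int a = 1, b = 2;

	tree[3] = &a;
	printf("replace ok: %d\n", replace_entry(3, &a, &b) == 0);
	printf("stale expected rejected: %d\n", replace_entry(3, &a, &b) != 0);
	return 0;
}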

+ 47 - 82
mm/slab.c

@@ -227,13 +227,14 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
 	INIT_LIST_HEAD(&parent->slabs_full);
 	INIT_LIST_HEAD(&parent->slabs_partial);
 	INIT_LIST_HEAD(&parent->slabs_free);
+	parent->total_slabs = 0;
+	parent->free_slabs = 0;
 	parent->shared = NULL;
 	parent->alien = NULL;
 	parent->colour_next = 0;
 	spin_lock_init(&parent->list_lock);
 	parent->free_objects = 0;
 	parent->free_touched = 0;
-	parent->num_slabs = 0;
 }
 
 #define MAKE_LIST(cachep, listp, slab, nodeid)				\
@@ -1366,7 +1367,6 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
 {
 #if DEBUG
 	struct kmem_cache_node *n;
-	struct page *page;
 	unsigned long flags;
 	int node;
 	static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
@@ -1381,32 +1381,18 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
 		cachep->name, cachep->size, cachep->gfporder);
 
 	for_each_kmem_cache_node(cachep, node, n) {
-		unsigned long active_objs = 0, num_objs = 0, free_objects = 0;
-		unsigned long active_slabs = 0, num_slabs = 0;
-		unsigned long num_slabs_partial = 0, num_slabs_free = 0;
-		unsigned long num_slabs_full;
+		unsigned long total_slabs, free_slabs, free_objs;
 
 		spin_lock_irqsave(&n->list_lock, flags);
-		num_slabs = n->num_slabs;
-		list_for_each_entry(page, &n->slabs_partial, lru) {
-			active_objs += page->active;
-			num_slabs_partial++;
-		}
-		list_for_each_entry(page, &n->slabs_free, lru)
-			num_slabs_free++;
-
-		free_objects += n->free_objects;
+		total_slabs = n->total_slabs;
+		free_slabs = n->free_slabs;
+		free_objs = n->free_objects;
 		spin_unlock_irqrestore(&n->list_lock, flags);
 
-		num_objs = num_slabs * cachep->num;
-		active_slabs = num_slabs - num_slabs_free;
-		num_slabs_full = num_slabs -
-			(num_slabs_partial + num_slabs_free);
-		active_objs += (num_slabs_full * cachep->num);
-
-		pr_warn("  node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
-			node, active_slabs, num_slabs, active_objs, num_objs,
-			free_objects);
+		pr_warn("  node %d: slabs: %ld/%ld, objs: %ld/%ld\n",
+			node, total_slabs - free_slabs, total_slabs,
+			(total_slabs * cachep->num) - free_objs,
+			total_slabs * cachep->num);
 	}
 #endif
 }
@@ -2318,7 +2304,8 @@ static int drain_freelist(struct kmem_cache *cache,
 
 		page = list_entry(p, struct page, lru);
 		list_del(&page->lru);
-		n->num_slabs--;
+		n->free_slabs--;
+		n->total_slabs--;
 		/*
 		 * Safe to drop the lock. The slab is no longer linked
 		 * to the cache.
@@ -2332,7 +2319,7 @@ out:
 	return nr_freed;
 }
 
-int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
+int __kmem_cache_shrink(struct kmem_cache *cachep)
 {
 	int ret = 0;
 	int node;
@@ -2352,7 +2339,7 @@ int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
 
 int __kmem_cache_shutdown(struct kmem_cache *cachep)
 {
-	return __kmem_cache_shrink(cachep, false);
+	return __kmem_cache_shrink(cachep);
 }
 
 void __kmem_cache_release(struct kmem_cache *cachep)
@@ -2753,12 +2740,13 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page)
 	n = get_node(cachep, page_to_nid(page));
 
 	spin_lock(&n->list_lock);
-	if (!page->active)
+	n->total_slabs++;
+	if (!page->active) {
 		list_add_tail(&page->lru, &(n->slabs_free));
-	else
+		n->free_slabs++;
+	} else
 		fixup_slab_list(cachep, n, page, &list);
 
-	n->num_slabs++;
 	STATS_INC_GROWN(cachep);
 	n->free_objects += cachep->num - page->active;
 	spin_unlock(&n->list_lock);
@@ -2903,9 +2891,10 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
 
 	/* Move pfmemalloc slab to the end of list to speed up next search */
 	list_del(&page->lru);
-	if (!page->active)
+	if (!page->active) {
 		list_add_tail(&page->lru, &n->slabs_free);
-	else
+		n->free_slabs++;
+	} else
 		list_add_tail(&page->lru, &n->slabs_partial);
 
 	list_for_each_entry(page, &n->slabs_partial, lru) {
@@ -2913,9 +2902,12 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
 			return page;
 	}
 
+	n->free_touched = 1;
 	list_for_each_entry(page, &n->slabs_free, lru) {
-		if (!PageSlabPfmemalloc(page))
+		if (!PageSlabPfmemalloc(page)) {
+			n->free_slabs--;
 			return page;
+		}
 	}
 
 	return NULL;
@@ -2925,16 +2917,18 @@ static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
 {
 	struct page *page;
 
-	page = list_first_entry_or_null(&n->slabs_partial,
-			struct page, lru);
+	assert_spin_locked(&n->list_lock);
+	page = list_first_entry_or_null(&n->slabs_partial, struct page, lru);
 	if (!page) {
 		n->free_touched = 1;
-		page = list_first_entry_or_null(&n->slabs_free,
-				struct page, lru);
+		page = list_first_entry_or_null(&n->slabs_free, struct page,
+						lru);
+		if (page)
+			n->free_slabs--;
 	}
 
 	if (sk_memalloc_socks())
-		return get_valid_first_slab(n, page, pfmemalloc);
+		page = get_valid_first_slab(n, page, pfmemalloc);
 
 	return page;
 }
@@ -3434,9 +3428,10 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
 		STATS_DEC_ACTIVE(cachep);
 
 		/* fixup slab chains */
-		if (page->active == 0)
+		if (page->active == 0) {
 			list_add(&page->lru, &n->slabs_free);
-		else {
+			n->free_slabs++;
+		} else {
 			/* Unconditionally move a slab to the end of the
 			 * partial list on free - maximum time for the
 			 * other objects to be freed, too.
@@ -3450,7 +3445,8 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
 
 		page = list_last_entry(&n->slabs_free, struct page, lru);
 		list_move(&page->lru, list);
-		n->num_slabs--;
+		n->free_slabs--;
+		n->total_slabs--;
 	}
 }
 
@@ -4102,64 +4098,33 @@ out:
 #ifdef CONFIG_SLABINFO
 void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
 {
-	struct page *page;
-	unsigned long active_objs;
-	unsigned long num_objs;
-	unsigned long active_slabs = 0;
-	unsigned long num_slabs, free_objects = 0, shared_avail = 0;
-	unsigned long num_slabs_partial = 0, num_slabs_free = 0;
-	unsigned long num_slabs_full = 0;
-	const char *name;
-	char *error = NULL;
+	unsigned long active_objs, num_objs, active_slabs;
+	unsigned long total_slabs = 0, free_objs = 0, shared_avail = 0;
+	unsigned long free_slabs = 0;
 	int node;
 	struct kmem_cache_node *n;
 
-	active_objs = 0;
-	num_slabs = 0;
 	for_each_kmem_cache_node(cachep, node, n) {
-
 		check_irq_on();
 		spin_lock_irq(&n->list_lock);
 
-		num_slabs += n->num_slabs;
+		total_slabs += n->total_slabs;
+		free_slabs += n->free_slabs;
+		free_objs += n->free_objects;
 
-		list_for_each_entry(page, &n->slabs_partial, lru) {
-			if (page->active == cachep->num && !error)
-				error = "slabs_partial accounting error";
-			if (!page->active && !error)
-				error = "slabs_partial accounting error";
-			active_objs += page->active;
-			num_slabs_partial++;
-		}
-
-		list_for_each_entry(page, &n->slabs_free, lru) {
-			if (page->active && !error)
-				error = "slabs_free accounting error";
-			num_slabs_free++;
-		}
-
-		free_objects += n->free_objects;
 		if (n->shared)
 			shared_avail += n->shared->avail;
 
 		spin_unlock_irq(&n->list_lock);
 	}
-	num_objs = num_slabs * cachep->num;
-	active_slabs = num_slabs - num_slabs_free;
-	num_slabs_full = num_slabs - (num_slabs_partial + num_slabs_free);
-	active_objs += (num_slabs_full * cachep->num);
-
-	if (num_objs - active_objs != free_objects && !error)
-		error = "free_objects accounting error";
-
-	name = cachep->name;
-	if (error)
-		pr_err("slab: cache %s error: %s\n", name, error);
+	num_objs = total_slabs * cachep->num;
+	active_slabs = total_slabs - free_slabs;
+	active_objs = num_objs - free_objs;
 
 	sinfo->active_objs = active_objs;
 	sinfo->num_objs = num_objs;
 	sinfo->active_slabs = active_slabs;
-	sinfo->num_slabs = num_slabs;
+	sinfo->num_slabs = total_slabs;
 	sinfo->shared_avail = shared_avail;
 	sinfo->limit = cachep->limit;
 	sinfo->batchcount = cachep->batchcount;
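
With per-node total_slabs and free_slabs maintained at grow and free time, the slab.c hunks let both the OOM report and get_slabinfo() derive their figures from three counters rather than walking every slab list under the node lock. A small standalone sketch of that derivation, with made-up values:

/*
 * Standalone sketch of how the slabinfo numbers fall out of the new
 * counters.  All values here are invented.
 */
#include <stdio.h>

int main(void)
{
	unsigned long total_slabs = 10;		/* sum of n->total_slabs */
	unsigned long free_slabs = 3;		/* sum of n->free_slabs */
	unsigned long free_objs = 70;		/* sum of n->free_objects */
	unsigned long objs_per_slab = 16;	/* cachep->num */

	unsigned long num_objs = total_slabs * objs_per_slab;
	unsigned long active_slabs = total_slabs - free_slabs;
	unsigned long active_objs = num_objs - free_objs;

	/* 10 slabs * 16 objs = 160; 70 free -> 90 active objs, 7 active slabs */
	printf("slabs %lu/%lu objs %lu/%lu\n",
	       active_slabs, total_slabs, active_objs, num_objs);
	return 0;
}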

+ 18 - 2
mm/slab.h

@@ -142,11 +142,26 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
 #define SLAB_CACHE_FLAGS (0)
 #endif
 
+/* Common flags available with current configuration */
 #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
 
+/* Common flags permitted for kmem_cache_create */
+#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \
+			      SLAB_RED_ZONE | \
+			      SLAB_POISON | \
+			      SLAB_STORE_USER | \
+			      SLAB_TRACE | \
+			      SLAB_CONSISTENCY_CHECKS | \
+			      SLAB_MEM_SPREAD | \
+			      SLAB_NOLEAKTRACE | \
+			      SLAB_RECLAIM_ACCOUNT | \
+			      SLAB_TEMPORARY | \
+			      SLAB_NOTRACK | \
+			      SLAB_ACCOUNT)
+
 int __kmem_cache_shutdown(struct kmem_cache *);
 void __kmem_cache_release(struct kmem_cache *);
-int __kmem_cache_shrink(struct kmem_cache *, bool);
+int __kmem_cache_shrink(struct kmem_cache *);
 void slab_kmem_cache_release(struct kmem_cache *);
 
 struct seq_file;
@@ -432,7 +447,8 @@ struct kmem_cache_node {
 	struct list_head slabs_partial;	/* partial list first, better asm code */
 	struct list_head slabs_full;
 	struct list_head slabs_free;
-	unsigned long num_slabs;
+	unsigned long total_slabs;	/* length of all slab lists */
+	unsigned long free_slabs;	/* length of free slab list only */
 	unsigned long free_objects;
 	unsigned int free_limit;
 	unsigned int colour_next;	/* Per-node cache coloring */

+ 31 - 2
mm/slab_common.c

@@ -404,6 +404,12 @@ kmem_cache_create(const char *name, size_t size, size_t align,
 		goto out_unlock;
 	}
 
+	/* Refuse requests with allocator specific flags */
+	if (flags & ~SLAB_FLAGS_PERMITTED) {
+		err = -EINVAL;
+		goto out_unlock;
+	}
+
 	/*
 	 * Some allocators will constraint the set of valid flags to a subset
 	 * of all flags. We expect them to define CACHE_CREATE_MASK in this
@@ -573,6 +579,29 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
 	get_online_cpus();
 	get_online_mems();
 
+#ifdef CONFIG_SLUB
+	/*
+	 * In case of SLUB, we need to disable empty slab caching to
+	 * avoid pinning the offline memory cgroup by freeable kmem
+	 * pages charged to it. SLAB doesn't need this, as it
+	 * periodically purges unused slabs.
+	 */
+	mutex_lock(&slab_mutex);
+	list_for_each_entry(s, &slab_caches, list) {
+		c = is_root_cache(s) ? cache_from_memcg_idx(s, idx) : NULL;
+		if (c) {
+			c->cpu_partial = 0;
+			c->min_partial = 0;
+		}
+	}
+	mutex_unlock(&slab_mutex);
+	/*
+	 * kmem_cache->cpu_partial is checked locklessly (see
+	 * put_cpu_partial()). Make sure the change is visible.
+	 */
+	synchronize_sched();
+#endif
+
 	mutex_lock(&slab_mutex);
 	list_for_each_entry(s, &slab_caches, list) {
 		if (!is_root_cache(s))
@@ -584,7 +613,7 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
 		if (!c)
 			continue;
 
-		__kmem_cache_shrink(c, true);
+		__kmem_cache_shrink(c);
 		arr->entries[idx] = NULL;
 	}
 	mutex_unlock(&slab_mutex);
@@ -755,7 +784,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
 	get_online_cpus();
 	get_online_mems();
 	kasan_cache_shrink(cachep);
-	ret = __kmem_cache_shrink(cachep, false);
+	ret = __kmem_cache_shrink(cachep);
 	put_online_mems();
 	put_online_cpus();
 	return ret;
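
Together with SLAB_FLAGS_PERMITTED defined in mm/slab.h above, kmem_cache_create() now rejects caller flags outside the common whitelist up front instead of handing them to the individual allocator. A minimal standalone illustration of that kind of check; the flag names and values below are invented for the example:

/*
 * Standalone sketch of a create-time flag whitelist; not kernel code.
 */
#include <stdio.h>

#define F_RED_ZONE	0x01UL
#define F_POISON	0x02UL
#define F_ACCOUNT	0x04UL
#define F_INTERNAL	0x80UL	/* allocator-internal, never valid for callers */

#define FLAGS_PERMITTED	(F_RED_ZONE | F_POISON | F_ACCOUNT)

static int create_cache(unsigned long flags)
{
	if (flags & ~FLAGS_PERMITTED)
		return -22;	/* -EINVAL */
	return 0;
}

int main(void)
{
	printf("debug flags:   %d\n", create_cache(F_RED_ZONE | F_POISON));	/* 0 */
	printf("internal flag: %d\n", create_cache(F_INTERNAL));		/* -22 */
	return 0;
}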

+ 1 - 1
mm/slob.c

@@ -634,7 +634,7 @@ void __kmem_cache_release(struct kmem_cache *c)
 {
 }
 
-int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate)
+int __kmem_cache_shrink(struct kmem_cache *d)
 {
 	return 0;
 }

Some files were not shown because too many files changed in this diff