8 years ago · bc49a7831b
--- a/Documentation/ABI/obsolete/sysfs-block-zram
+++ b/Documentation/ABI/obsolete/sysfs-block-zram
@@ -1,119 +0,0 @@
 
				-What:		/sys/block/zram<id>/num_reads
			
 
				-Date:		August 2015
			
 
				-Contact:	Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
			
 
				-Description:
			
 
				-		The num_reads file is read-only and specifies the number of
			
 
				-		reads (failed or successful) done on this device.
			
 
				-		Now accessible via zram<id>/stat node.
			
 
				-
			
 
				-What:		/sys/block/zram<id>/num_writes
			
 
				-Date:		August 2015
			
 
				-Contact:	Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
			
 
				-Description:
			
 
				-		The num_writes file is read-only and specifies the number of
			
 
				-		writes (failed or successful) done on this device.
			
 
				-		Now accessible via zram<id>/stat node.
			
 
				-
			
 
				-What:		/sys/block/zram<id>/invalid_io
			
 
				-Date:		August 2015
			
 
				-Contact:	Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
			
 
				-Description:
			
 
				-		The invalid_io file is read-only and specifies the number of
			
 
				-		non-page-size-aligned I/O requests issued to this device.
			
 
				-		Now accessible via zram<id>/io_stat node.
			
 
				-
			
 
				-What:		/sys/block/zram<id>/failed_reads
			
 
				-Date:		August 2015
			
 
				-Contact:	Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
			
 
				-Description:
			
 
				-		The failed_reads file is read-only and specifies the number of
			
 
				-		failed reads happened on this device.
			
 
				-		Now accessible via zram<id>/io_stat node.
			
 
				-
			
 
				-What:		/sys/block/zram<id>/failed_writes
			
 
				-Date:		August 2015
			
 
				-Contact:	Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
			
 
				-Description:
			
 
				-		The failed_writes file is read-only and specifies the number of
			
 
				-		failed writes happened on this device.
			
 
				-		Now accessible via zram<id>/io_stat node.
			
 
				-
			
 
				-What:		/sys/block/zram<id>/notify_free
			
 
				-Date:		August 2015
			
 
				-Contact:	Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
			
 
				-Description:
			
 
				-		The notify_free file is read-only. Depending on device usage
			
 
				-		scenario it may account a) the number of pages freed because
			
 
				-		of swap slot free notifications or b) the number of pages freed
			
 
				-		because of REQ_DISCARD requests sent by bio. The former ones
			
 
				-		are sent to a swap block device when a swap slot is freed, which
			
 
				-		implies that this disk is being used as a swap disk. The latter
			
 
				-		ones are sent by filesystem mounted with discard option,
			
 
				-		whenever some data blocks are getting discarded.
			
 
				-		Now accessible via zram<id>/io_stat node.
			
 
				-
			
 
				-What:		/sys/block/zram<id>/zero_pages
			
 
				-Date:		August 2015
			
 
				-Contact:	Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
			
 
				-Description:
			
 
				-		The zero_pages file is read-only and specifies number of zero
			
 
				-		filled pages written to this disk. No memory is allocated for
			
 
				-		such pages.
			
 
				-		Now accessible via zram<id>/mm_stat node.
			
 
				-
			
 
				-What:		/sys/block/zram<id>/orig_data_size
			
 
				-Date:		August 2015
			
 
				-Contact:	Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
			
 
				-Description:
			
 
				-		The orig_data_size file is read-only and specifies uncompressed
			
 
				-		size of data stored in this disk. This excludes zero-filled
			
 
				-		pages (zero_pages) since no memory is allocated for them.
			
 
				-		Unit: bytes
			
 
				-		Now accessible via zram<id>/mm_stat node.
			
 
				-
			
 
				-What:		/sys/block/zram<id>/compr_data_size
			
 
				-Date:		August 2015
			
 
				-Contact:	Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
			
 
				-Description:
			
 
				-		The compr_data_size file is read-only and specifies compressed
			
 
				-		size of data stored in this disk. So, compression ratio can be
			
 
				-		calculated using orig_data_size and this statistic.
			
 
				-		Unit: bytes
			
 
				-		Now accessible via zram<id>/mm_stat node.
			
 
				-
			
 
				-What:		/sys/block/zram<id>/mem_used_total
			
 
				-Date:		August 2015
			
 
				-Contact:	Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
			
 
				-Description:
			
 
				-		The mem_used_total file is read-only and specifies the amount
			
 
				-		of memory, including allocator fragmentation and metadata
			
 
				-		overhead, allocated for this disk. So, allocator space
			
 
				-		efficiency can be calculated using compr_data_size and this
			
 
				-		statistic.
			
 
				-		Unit: bytes
			
 
				-		Now accessible via zram<id>/mm_stat node.
			
 
				-
			
 
				-What:		/sys/block/zram<id>/mem_used_max
			
 
				-Date:		August 2015
			
 
				-Contact:	Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
			
 
				-Description:
			
 
				-		The mem_used_max file is read/write and specifies the amount
			
 
				-		of maximum memory zram have consumed to store compressed data.
			
 
				-		For resetting the value, you should write "0". Otherwise,
			
 
				-		you could see -EINVAL.
			
 
				-		Unit: bytes
			
 
				-		Downgraded to write-only node: so it's possible to set new
			
 
				-		value only; its current value is stored in zram<id>/mm_stat
			
 
				-		node.
			
 
				-
			
 
				-What:		/sys/block/zram<id>/mem_limit
			
 
				-Date:		August 2015
			
 
				-Contact:	Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
			
 
				-Description:
			
 
				-		The mem_limit file is read/write and specifies the maximum
			
 
				-		amount of memory ZRAM can use to store the compressed data.
			
 
				-		The limit could be changed in run time and "0" means disable
			
 
				-		the limit.  No limit is the initial state.  Unit: bytes
			
 
				-		Downgraded to write-only node: so it's possible to set new
			
 
				-		value only; its current value is stored in zram<id>/mm_stat
			
 
				-		node.
			
--- a/Documentation/ABI/testing/sysfs-block-zram
+++ b/Documentation/ABI/testing/sysfs-block-zram
@@ -22,41 +22,6 @@ Description:
 
				 		device. The reset operation frees all the memory associated
			
 
				 		with this device.
			
 
				 
			
 
				-What:		/sys/block/zram<id>/num_reads
			
 
				-Date:		August 2010
			
 
				-Contact:	Nitin Gupta <ngupta@vflare.org>
			
 
				-Description:
			
 
				-		The num_reads file is read-only and specifies the number of
			
 
				-		reads (failed or successful) done on this device.
			
 
				-
			
 
				-What:		/sys/block/zram<id>/num_writes
			
 
				-Date:		August 2010
			
 
				-Contact:	Nitin Gupta <ngupta@vflare.org>
			
 
				-Description:
			
 
				-		The num_writes file is read-only and specifies the number of
			
 
				-		writes (failed or successful) done on this device.
			
 
				-
			
 
				-What:		/sys/block/zram<id>/invalid_io
			
 
				-Date:		August 2010
			
 
				-Contact:	Nitin Gupta <ngupta@vflare.org>
			
 
				-Description:
			
 
				-		The invalid_io file is read-only and specifies the number of
			
 
				-		non-page-size-aligned I/O requests issued to this device.
			
 
				-
			
 
				-What:		/sys/block/zram<id>/failed_reads
			
 
				-Date:		February 2014
			
 
				-Contact:	Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
			
 
				-Description:
			
 
				-		The failed_reads file is read-only and specifies the number of
			
 
				-		failed reads happened on this device.
			
 
				-
			
 
				-What:		/sys/block/zram<id>/failed_writes
			
 
				-Date:		February 2014
			
 
				-Contact:	Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
			
 
				-Description:
			
 
				-		The failed_writes file is read-only and specifies the number of
			
 
				-		failed writes happened on this device.
			
 
				-
			
 
				 What:		/sys/block/zram<id>/max_comp_streams
			
 
				 Date:		February 2014
			
 
				 Contact:	Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
			
@@ -73,74 +38,24 @@ Description:
 
				 		available and selected compression algorithms, change
			
 
				 		compression algorithm selection.
			
 
				 
			
 
				-What:		/sys/block/zram<id>/notify_free
			
 
				-Date:		August 2010
			
 
				-Contact:	Nitin Gupta <ngupta@vflare.org>
			
 
				-Description:
			
 
				-		The notify_free file is read-only. Depending on device usage
			
 
				-		scenario it may account a) the number of pages freed because
			
 
				-		of swap slot free notifications or b) the number of pages freed
			
 
				-		because of REQ_DISCARD requests sent by bio. The former ones
			
 
				-		are sent to a swap block device when a swap slot is freed, which
			
 
				-		implies that this disk is being used as a swap disk. The latter
			
 
				-		ones are sent by filesystem mounted with discard option,
			
 
				-		whenever some data blocks are getting discarded.
			
 
				-
			
 
				-What:		/sys/block/zram<id>/zero_pages
			
 
				-Date:		August 2010
			
 
				-Contact:	Nitin Gupta <ngupta@vflare.org>
			
 
				-Description:
			
 
				-		The zero_pages file is read-only and specifies number of zero
			
 
				-		filled pages written to this disk. No memory is allocated for
			
 
				-		such pages.
			
 
				-
			
 
				-What:		/sys/block/zram<id>/orig_data_size
			
 
				-Date:		August 2010
			
 
				-Contact:	Nitin Gupta <ngupta@vflare.org>
			
 
				-Description:
			
 
				-		The orig_data_size file is read-only and specifies uncompressed
			
 
				-		size of data stored in this disk. This excludes zero-filled
			
 
				-		pages (zero_pages) since no memory is allocated for them.
			
 
				-		Unit: bytes
			
 
				-
			
 
				-What:		/sys/block/zram<id>/compr_data_size
			
 
				-Date:		August 2010
			
 
				-Contact:	Nitin Gupta <ngupta@vflare.org>
			
 
				-Description:
			
 
				-		The compr_data_size file is read-only and specifies compressed
			
 
				-		size of data stored in this disk. So, compression ratio can be
			
 
				-		calculated using orig_data_size and this statistic.
			
 
				-		Unit: bytes
			
 
				-
			
 
				-What:		/sys/block/zram<id>/mem_used_total
			
 
				-Date:		August 2010
			
 
				-Contact:	Nitin Gupta <ngupta@vflare.org>
			
 
				-Description:
			
 
				-		The mem_used_total file is read-only and specifies the amount
			
 
				-		of memory, including allocator fragmentation and metadata
			
 
				-		overhead, allocated for this disk. So, allocator space
			
 
				-		efficiency can be calculated using compr_data_size and this
			
 
				-		statistic.
			
 
				-		Unit: bytes
			
 
				-
			
 
				 What:		/sys/block/zram<id>/mem_used_max
			
 
				 Date:		August 2014
			
 
				 Contact:	Minchan Kim <minchan@kernel.org>
			
 
				 Description:
			
 
				-		The mem_used_max file is read/write and specifies the amount
			
 
				-		of maximum memory zram have consumed to store compressed data.
			
 
				-		For resetting the value, you should write "0". Otherwise,
			
 
				-		you could see -EINVAL.
			
 
				+		The mem_used_max file is write-only and is used to reset
			
 
				+		the counter of maximum memory zram has consumed to store
			
 
				+		compressed data. For resetting the value, you should write
			
 
				+		"0". Otherwise, you could see -EINVAL.
			
 
				 		Unit: bytes
			
 
				 
			
 
				 What:		/sys/block/zram<id>/mem_limit
			
 
				 Date:		August 2014
			
 
				 Contact:	Minchan Kim <minchan@kernel.org>
			
 
				 Description:
			
 
				-		The mem_limit file is read/write and specifies the maximum
			
 
				-		amount of memory ZRAM can use to store the compressed data.  The
			
 
				-		limit could be changed in run time and "0" means disable the
			
 
				-		limit.  No limit is the initial state.  Unit: bytes
			
 
				+		The mem_limit file is write-only and specifies the maximum
			
 
				+		amount of memory ZRAM can use to store the compressed data.
			
 
				+		The limit could be changed in run time and "0" means disable
			
 
				+		the limit. No limit is the initial state.  Unit: bytes
			
 
				 
			
 
				 What:		/sys/block/zram<id>/compact
			
 
				 Date:		August 2015
			
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3694,6 +3694,14 @@
 
				 			last alloc / free. For more information see
			
 
				 			Documentation/vm/slub.txt.
			
 
				 
			
 
				+	slub_memcg_sysfs=	[MM, SLUB]
			
 
				+			Determines whether to enable sysfs directories for
			
 
				+			memory cgroup sub-caches. 1 to enable, 0 to disable.
			
 
				+			The default is determined by CONFIG_SLUB_MEMCG_SYSFS_ON.
			
 
				+			Enabling this can lead to a very high number of debug
			
 
				+			directories and files being created under
			
 
				+			/sys/kernel/slub.
			
 
				+
			
 
				 	slub_max_order= [MM, SLUB]
			
 
				 			Determines the maximum allowed order for slabs.
			
 
				 			A high setting may cause OOMs due to memory
			
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -161,42 +161,14 @@ Name            access            description
 
				 disksize          RW    show and set the device's disk size
			
 
				 initstate         RO    shows the initialization state of the device
			
 
				 reset             WO    trigger device reset
			
 
				-num_reads         RO    the number of reads
			
 
				-failed_reads      RO    the number of failed reads
			
 
				-num_write         RO    the number of writes
			
 
				-failed_writes     RO    the number of failed writes
			
 
				-invalid_io        RO    the number of non-page-size-aligned I/O requests
			
 
				+mem_used_max      WO    reset the `mem_used_max' counter (see later)
			
 
				+mem_limit         WO    specifies the maximum amount of memory ZRAM can use
			
 
				+                        to store the compressed data
			
 
				 max_comp_streams  RW    the number of possible concurrent compress operations
			
 
				 comp_algorithm    RW    show and change the compression algorithm
			
 
				-notify_free       RO    the number of notifications to free pages (either
			
 
				-                        slot free notifications or REQ_DISCARD requests)
			
 
				-zero_pages        RO    the number of zero filled pages written to this disk
			
 
				-orig_data_size    RO    uncompressed size of data stored in this disk
			
 
				-compr_data_size   RO    compressed size of data stored in this disk
			
 
				-mem_used_total    RO    the amount of memory allocated for this disk
			
 
				-mem_used_max      RW    the maximum amount of memory zram have consumed to
			
 
				-                        store the data (to reset this counter to the actual
			
 
				-                        current value, write 1 to this attribute)
			
 
				-mem_limit         RW    the maximum amount of memory ZRAM can use to store
			
 
				-                        the compressed data
			
 
				-pages_compacted   RO    the number of pages freed during compaction
			
 
				-                        (available only via zram<id>/mm_stat node)
			
 
				 compact           WO    trigger memory compaction
			
 
				 debug_stat        RO    this file is used for zram debugging purposes
			
 
				 
			
 
				-WARNING
			
 
				-=======
			
 
				-per-stat sysfs attributes are considered to be deprecated.
			
 
				-The basic strategy is:
			
 
				--- the existing RW nodes will be downgraded to WO nodes (in linux 4.11)
			
 
				--- deprecated RO sysfs nodes will eventually be removed (in linux 4.11)
			
 
				-
			
 
				-The list of deprecated attributes can be found here:
			
 
				-Documentation/ABI/obsolete/sysfs-block-zram
			
 
				-
			
 
				-Basically, every attribute that has its own read accessible sysfs node
			
 
				-(e.g. num_reads) *AND* is accessible via one of the stat files (zram<id>/stat
			
 
				-or zram<id>/io_stat or zram<id>/mm_stat) is considered to be deprecated.
			
 
				 
			
 
				 User space is advised to use the following files to read the device statistics.
			
 
				 
			
@@ -211,22 +183,40 @@ The stat file represents device's I/O statistics not accounted by block
 
				 layer and, thus, not available in zram<id>/stat file. It consists of a
			
 
				 single line of text and contains the following stats separated by
			
 
				 whitespace:
			
 
				-	failed_reads
			
 
				-	failed_writes
			
 
				-	invalid_io
			
 
				-	notify_free
			
 
				+ failed_reads     the number of failed reads
			
 
				+ failed_writes    the number of failed writes
			
 
				+ invalid_io       the number of non-page-size-aligned I/O requests
			
 
				+ notify_free      Depending on device usage scenario it may account
			
 
				+                  a) the number of pages freed because of swap slot free
			
 
				+                  notifications or b) the number of pages freed because of
			
 
				+                  REQ_DISCARD requests sent by bio. The former ones are
			
 
				+                  sent to a swap block device when a swap slot is freed,
			
 
				+                  which implies that this disk is being used as a swap disk.
			
 
				+                  The latter ones are sent by filesystem mounted with
			
 
				+                  discard option, whenever some data blocks are getting
			
 
				+                  discarded.
			
 
				 
			
 
				 File /sys/block/zram<id>/mm_stat
			
 
				 
			
 
				 The stat file represents device's mm statistics. It consists of a single
			
 
				 line of text and contains the following stats separated by whitespace:
			
 
				-	orig_data_size
			
 
				-	compr_data_size
			
 
				-	mem_used_total
			
 
				-	mem_limit
			
 
				-	mem_used_max
			
 
				-	zero_pages
			
 
				-	num_migrated
			
 
				+ orig_data_size   uncompressed size of data stored in this disk.
			
 
				+                  This excludes zero-filled pages (zero_pages) since no
			
 
				+                  memory is allocated for them.
			
 
				+                  Unit: bytes
			
 
				+ compr_data_size  compressed size of data stored in this disk
			
 
				+ mem_used_total   the amount of memory allocated for this disk. This
			
 
				+                  includes allocator fragmentation and metadata overhead,
			
 
				+                  allocated for this disk. So, allocator space efficiency
			
 
				+                  can be calculated using compr_data_size and this statistic.
			
 
				+                  Unit: bytes
			
 
				+ mem_limit        the maximum amount of memory ZRAM can use to store
			
 
				+                  the compressed data
			
 
				+ mem_used_max     the maximum amount of memory zram has consumed to
			
 
				+                  store the data
			
 
				+ zero_pages       the number of zero filled pages written to this disk.
			
 
				+                  No memory is allocated for such pages.
			
 
				+ pages_compacted  the number of pages freed during compaction
			
 
				 
			
 
				 9) Deactivate:
			
 
				 	swapoff /dev/zram0
			
--- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
+++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
@@ -112,8 +112,8 @@ my $regex_direct_end_default = 'nr_reclaimed=([0-9]*)';
 
				 my $regex_kswapd_wake_default = 'nid=([0-9]*) order=([0-9]*)';
			
 
				 my $regex_kswapd_sleep_default = 'nid=([0-9]*)';
			
 
				 my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*)';
			
 
				-my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_taken=([0-9]*) file=([0-9]*)';
			
 
				-my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) zid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)';
			
 
				+my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) classzone_idx=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_skipped=([0-9]*) nr_taken=([0-9]*) lru=([a-z_]*)';
			
 
				+my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)';
			
 
				 my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_rotated=([0-9]*) priority=([0-9]*)';
			
 
				 my $regex_writepage_default = 'page=([0-9a-f]*) pfn=([0-9]*) flags=([A-Z_|]*)';
			
 
				 
			
@@ -205,15 +205,15 @@ $regex_wakeup_kswapd = generate_traceevent_regex(
 
				 $regex_lru_isolate = generate_traceevent_regex(
			
 
				 			"vmscan/mm_vmscan_lru_isolate",
			
 
				 			$regex_lru_isolate_default,
			
 
				-			"isolate_mode", "order",
			
 
				-			"nr_requested", "nr_scanned", "nr_taken",
			
 
				-			"file");
			
 
				+			"isolate_mode", "classzone_idx", "order",
			
 
				+			"nr_requested", "nr_scanned", "nr_skipped", "nr_taken",
			
 
				+			"lru");
			
 
				 $regex_lru_shrink_inactive = generate_traceevent_regex(
			
 
				 			"vmscan/mm_vmscan_lru_shrink_inactive",
			
 
				 			$regex_lru_shrink_inactive_default,
			
 
				-			"nid", "zid",
			
 
				-			"nr_scanned", "nr_reclaimed", "priority",
			
 
				-			"flags");
			
 
				+			"nid", "nr_scanned", "nr_reclaimed", "nr_dirty", "nr_writeback",
			
 
				+			"nr_congested", "nr_immediate", "nr_activate", "nr_ref_keep",
			
 
				+			"nr_unmap_fail", "priority", "flags");
			
 
				 $regex_lru_shrink_active = generate_traceevent_regex(
			
 
				 			"vmscan/mm_vmscan_lru_shrink_active",
			
 
				 			$regex_lru_shrink_active_default,
			
@@ -381,8 +381,8 @@ EVENT_PROCESS:
 
				 				next;
			
 
				 			}
			
 
				 			my $isolate_mode = $1;
			
 
				-			my $nr_scanned = $4;
			
 
				-			my $file = $6;
			
 
				+			my $nr_scanned = $5;
			
 
				+			my $file = $8;
			
 
				 
			
 
				 			# To closer match vmstat scanning statistics, only count isolate_both
			
 
				 			# and isolate_inactive as scanning. isolate_active is rotation
			
@@ -391,7 +391,7 @@ EVENT_PROCESS:
 
				 			# isolate_both     == 3
			
 
				 			if ($isolate_mode != 2) {
			
 
				 				$perprocesspid{$process_pid}->{HIGH_NR_SCANNED} += $nr_scanned;
			
 
				-				if ($file == 1) {
			
 
				+				if ($file =~ /_file/) {
			
 
				 					$perprocesspid{$process_pid}->{HIGH_NR_FILE_SCANNED} += $nr_scanned;
			
 
				 				} else {
			
 
				 					$perprocesspid{$process_pid}->{HIGH_NR_ANON_SCANNED} += $nr_scanned;
			
@@ -406,8 +406,8 @@ EVENT_PROCESS:
 
				 				next;
			
 
				 			}
			
 
				 
			
 
				-			my $nr_reclaimed = $4;
			
 
				-			my $flags = $6;
			
 
				+			my $nr_reclaimed = $3;
			
 
				+			my $flags = $12;
			
 
				 			my $file = 0;
			
 
				 			if ($flags =~ /RECLAIM_WB_FILE/) {
			
 
				 				$file = 1;
			
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -110,6 +110,7 @@ MADV_HUGEPAGE region.
 
				 
			
 
				 echo always >/sys/kernel/mm/transparent_hugepage/defrag
			
 
				 echo defer >/sys/kernel/mm/transparent_hugepage/defrag
			
 
				+echo defer+madvise >/sys/kernel/mm/transparent_hugepage/defrag
			
 
				 echo madvise >/sys/kernel/mm/transparent_hugepage/defrag
			
 
				 echo never >/sys/kernel/mm/transparent_hugepage/defrag
			
 
				 
			
@@ -120,10 +121,15 @@ that benefit heavily from THP use and are willing to delay the VM start
 
				 to utilise them.
			
 
				 
			
 
				 "defer" means that an application will wake kswapd in the background
			
 
				-to reclaim pages and wake kcompact to compact memory so that THP is
			
 
				+to reclaim pages and wake kcompactd to compact memory so that THP is
			
 
				 available in the near future. It's the responsibility of khugepaged
			
 
				 to then install the THP pages later.
			
 
				 
			
 
				+"defer+madvise" will enter direct reclaim and compaction like "always", but
			
 
				+only for regions that have used madvise(MADV_HUGEPAGE); all other regions
			
 
				+will wake kswapd in the background to reclaim pages and wake kcompactd to
			
 
				+compact memory so that THP is available in the near future.
			
 
				+
			
 
				 "madvise" will enter direct reclaim like "always" but only for regions
			
 
				 that are have used madvise(MADV_HUGEPAGE). This is the default behaviour.
			
 
				 
			
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3926,10 +3926,13 @@ S:	Maintained
 
				 F:	drivers/i2c/busses/i2c-diolan-u2c.c
			
 
				 
			
 
				 DIRECT ACCESS (DAX)
			
 
				-M:	Matthew Wilcox <willy@linux.intel.com>
			
 
				+M:	Matthew Wilcox <mawilcox@microsoft.com>
			
 
				+M:	Ross Zwisler <ross.zwisler@linux.intel.com>
			
 
				 L:	linux-fsdevel@vger.kernel.org
			
 
				 S:	Supported
			
 
				 F:	fs/dax.c
			
 
				+F:	include/linux/dax.h
			
 
				+F:	include/trace/events/fs_dax.h
			
 
				 
			
 
				 DIRECTORY NOTIFICATION (DNOTIFY)
			
 
				 M:	Eric Paris <eparis@parisplace.org>
			
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -684,51 +684,3 @@ int arch_remove_memory(u64 start, u64 size)
 
				 }
			
 
				 #endif
			
 
				 #endif
			
 
				-
			
 
				-/**
			
 
				- * show_mem - give short summary of memory stats
			
 
				- *
			
 
				- * Shows a simple page count of reserved and used pages in the system.
			
 
				- * For discontig machines, it does this on a per-pgdat basis.
			
 
				- */
			
 
				-void show_mem(unsigned int filter)
			
 
				-{
			
 
				-	int total_reserved = 0;
			
 
				-	unsigned long total_present = 0;
			
 
				-	pg_data_t *pgdat;
			
 
				-
			
 
				-	printk(KERN_INFO "Mem-info:\n");
			
 
				-	show_free_areas(filter);
			
 
				-	printk(KERN_INFO "Node memory in pages:\n");
			
 
				-	for_each_online_pgdat(pgdat) {
			
 
				-		unsigned long present;
			
 
				-		unsigned long flags;
			
 
				-		int reserved = 0;
			
 
				-		int nid = pgdat->node_id;
			
 
				-		int zoneid;
			
 
				-
			
 
				-		if (skip_free_areas_node(filter, nid))
			
 
				-			continue;
			
 
				-		pgdat_resize_lock(pgdat, &flags);
			
 
				-
			
 
				-		for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
			
 
				-			struct zone *zone = &pgdat->node_zones[zoneid];
			
 
				-			if (!populated_zone(zone))
			
 
				-				continue;
			
 
				-
			
 
				-			reserved += zone->present_pages - zone->managed_pages;
			
 
				-		}
			
 
				-		present = pgdat->node_present_pages;
			
 
				-
			
 
				-		pgdat_resize_unlock(pgdat, &flags);
			
 
				-		total_present += present;
			
 
				-		total_reserved += reserved;
			
 
				-		printk(KERN_INFO "Node %4d:  RAM: %11ld, rsvd: %8d, ",
			
 
				-		       nid, present, reserved);
			
 
				-	}
			
 
				-	printk(KERN_INFO "%ld pages of RAM\n", total_present);
			
 
				-	printk(KERN_INFO "%d reserved pages\n", total_reserved);
			
 
				-	printk(KERN_INFO "Total of %ld pages in page table cache\n",
			
 
				-	       quicklist_total_size());
			
 
				-	printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages());
			
 
				-}
			
--- a/arch/m32r/include/asm/Kbuild
+++ b/arch/m32r/include/asm/Kbuild
@@ -1,5 +1,6 @@
 
				 
			
 
				 generic-y += clkdev.h
			
 
				+generic-y += current.h
			
 
				 generic-y += exec.h
			
 
				 generic-y += irq_work.h
			
 
				 generic-y += kvm_para.h
			
--- a/arch/m32r/include/asm/cmpxchg.h
+++ b/arch/m32r/include/asm/cmpxchg.h
@@ -64,8 +64,10 @@ __xchg(unsigned long x, volatile void *ptr, int size)
 
				 	return (tmp);
			
 
				 }
			
 
				 
			
 
				-#define xchg(ptr, x)							\
			
 
				-	((__typeof__(*(ptr)))__xchg((unsigned long)(x), (ptr), sizeof(*(ptr))))
			
 
				+#define xchg(ptr, x) ({							\
			
 
				+	((__typeof__(*(ptr)))__xchg((unsigned long)(x), (ptr),		\
			
 
				+				    sizeof(*(ptr))));			\
			
 
				+})
			
 
				 
			
 
				 static __always_inline unsigned long
			
 
				 __xchg_local(unsigned long x, volatile void *ptr, int size)
			
@@ -187,9 +189,12 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, int size)
 
				 	return old;
			
 
				 }
			
 
				 
			
 
				-#define cmpxchg(ptr, o, n)						 \
			
 
				-	((__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)(o),	 \
			
 
				-			(unsigned long)(n), sizeof(*(ptr))))
			
 
				+#define cmpxchg(ptr, o, n) ({				\
			
 
				+	((__typeof__(*(ptr)))				\
			
 
				+		 __cmpxchg((ptr), (unsigned long)(o),	\
			
 
				+			   (unsigned long)(n),		\
			
 
				+			   sizeof(*(ptr))));		\
			
 
				+})
			
 
				 
			
 
				 #include <asm-generic/cmpxchg-local.h>
			
 
				 
			
--- a/arch/m32r/include/asm/current.h
+++ b/arch/m32r/include/asm/current.h
@@ -1,15 +0,0 @@
 
				-#ifndef _ASM_M32R_CURRENT_H
			
 
				-#define _ASM_M32R_CURRENT_H
			
 
				-
			
 
				-#include <linux/thread_info.h>
			
 
				-
			
 
				-struct task_struct;
			
 
				-
			
 
				-static __inline__ struct task_struct *get_current(void)
			
 
				-{
			
 
				-	return current_thread_info()->task;
			
 
				-}
			
 
				-
			
 
				-#define current	(get_current())
			
 
				-
			
 
				-#endif	/* _ASM_M32R_CURRENT_H */
			
--- a/arch/parisc/include/asm/Kbuild
+++ b/arch/parisc/include/asm/Kbuild
@@ -2,6 +2,7 @@
 
				 generic-y += auxvec.h
			
 
				 generic-y += barrier.h
			
 
				 generic-y += clkdev.h
			
 
				+generic-y += current.h
			
 
				 generic-y += device.h
			
 
				 generic-y += div64.h
			
 
				 generic-y += emergency-restart.h
			
--- a/arch/parisc/include/asm/current.h
+++ b/arch/parisc/include/asm/current.h
@@ -1,15 +0,0 @@
 
				-#ifndef _PARISC_CURRENT_H
			
 
				-#define _PARISC_CURRENT_H
			
 
				-
			
 
				-#include <linux/thread_info.h>
			
 
				-
			
 
				-struct task_struct;
			
 
				-
			
 
				-static inline struct task_struct * get_current(void)
			
 
				-{
			
 
				-	return current_thread_info()->task;
			
 
				-}
			
 
				- 
			
 
				-#define current get_current()
			
 
				-
			
 
				-#endif /* !(_PARISC_CURRENT_H) */
			
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -653,55 +653,6 @@ void __init mem_init(void)
 
				 unsigned long *empty_zero_page __read_mostly;
			
 
				 EXPORT_SYMBOL(empty_zero_page);
			
 
				 
			
 
				-void show_mem(unsigned int filter)
			
 
				-{
			
 
				-	int total = 0,reserved = 0;
			
 
				-	pg_data_t *pgdat;
			
 
				-
			
 
				-	printk(KERN_INFO "Mem-info:\n");
			
 
				-	show_free_areas(filter);
			
 
				-
			
 
				-	for_each_online_pgdat(pgdat) {
			
 
				-		unsigned long flags;
			
 
				-		int zoneid;
			
 
				-
			
 
				-		pgdat_resize_lock(pgdat, &flags);
			
 
				-		for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
			
 
				-			struct zone *zone = &pgdat->node_zones[zoneid];
			
 
				-			if (!populated_zone(zone))
			
 
				-				continue;
			
 
				-
			
 
				-			total += zone->present_pages;
			
 
				-			reserved = zone->present_pages - zone->managed_pages;
			
 
				-		}
			
 
				-		pgdat_resize_unlock(pgdat, &flags);
			
 
				-	}
			
 
				-
			
 
				-	printk(KERN_INFO "%d pages of RAM\n", total);
			
 
				-	printk(KERN_INFO "%d reserved pages\n", reserved);
			
 
				-
			
 
				-#ifdef CONFIG_DISCONTIGMEM
			
 
				-	{
			
 
				-		struct zonelist *zl;
			
 
				-		int i, j;
			
 
				-
			
 
				-		for (i = 0; i < npmem_ranges; i++) {
			
 
				-			zl = node_zonelist(i, 0);
			
 
				-			for (j = 0; j < MAX_NR_ZONES; j++) {
			
 
				-				struct zoneref *z;
			
 
				-				struct zone *zone;
			
 
				-
			
 
				-				printk("Zone list for zone %d on node %d: ", j, i);
			
 
				-				for_each_zone_zonelist(zone, z, zl, j)
			
 
				-					printk("[%d/%s] ", zone_to_nid(zone),
			
 
				-								zone->name);
			
 
				-				printk("\n");
			
 
				-			}
			
 
				-		}
			
 
				-	}
			
 
				-#endif
			
 
				-}
			
 
				-
			
 
				 /*
			
 
				  * pagetable_init() sets up the page tables
			
 
				  *
			
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -230,7 +230,9 @@ extern long long virt_phys_offset;
 
				  * and needs to be executable.  This means the whole heap ends
			
 
				  * up being executable.
			
 
				  */
			
 
				-#define VM_DATA_DEFAULT_FLAGS32	(VM_READ | VM_WRITE | VM_EXEC | \
			
 
				+#define VM_DATA_DEFAULT_FLAGS32 \
			
 
				+	(((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) | \
			
 
				+				 VM_READ | VM_WRITE | \
			
 
				 				 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
			
 
				 
			
 
				 #define VM_DATA_DEFAULT_FLAGS64	(VM_READ | VM_WRITE | \
			
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -916,7 +916,7 @@ cmds(struct pt_regs *excp)
 
				 				memzcan();
			
 
				 				break;
			
 
				 			case 'i':
			
 
				-				show_mem(0);
			
 
				+				show_mem(0, NULL);
			
 
				 				break;
			
 
				 			default:
			
 
				 				termch = cmd;
			
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -687,7 +687,7 @@ void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
 
				 		/* Find vma in the parent mm */
			
 
				 		vma = find_vma(gmap->mm, vmaddr);
			
 
				 		size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
			
 
				-		zap_page_range(vma, vmaddr, size, NULL);
			
 
				+		zap_page_range(vma, vmaddr, size);
			
 
				 	}
			
 
				 	up_read(&gmap->mm->mmap_sem);
			
 
				 }
			
--- a/arch/score/include/asm/Kbuild
+++ b/arch/score/include/asm/Kbuild
@@ -1,9 +1,9 @@
 
				 
			
 
				 header-y +=
			
 
				 
			
 
				-
			
 
				 generic-y += barrier.h
			
 
				 generic-y += clkdev.h
			
 
				+generic-y += current.h
			
 
				 generic-y += irq_work.h
			
 
				 generic-y += mcs_spinlock.h
			
 
				 generic-y += mm-arch-hooks.h
			
--- a/arch/score/include/asm/current.h
+++ b/arch/score/include/asm/current.h
@@ -1,6 +0,0 @@
 
				-#ifndef _ASM_SCORE_CURRENT_H
			
 
				-#define _ASM_SCORE_CURRENT_H
			
 
				-
			
 
				-#include <asm-generic/current.h>
			
 
				-
			
 
				-#endif /* _ASM_SCORE_CURRENT_H */
			
--- a/arch/sparc/kernel/setup_32.c
+++ b/arch/sparc/kernel/setup_32.c
@@ -82,7 +82,7 @@ static void prom_sync_me(void)
 
				 			     "nop\n\t" : : "r" (&trapbase));
			
 
				 
			
 
				 	prom_printf("PROM SYNC COMMAND...\n");
			
 
				-	show_free_areas(0);
			
 
				+	show_free_areas(0, NULL);
			
 
				 	if (!is_idle_task(current)) {
			
 
				 		local_irq_enable();
			
 
				 		sys_sync();
			
--- a/arch/sparc/mm/init_32.c
+++ b/arch/sparc/mm/init_32.c
@@ -55,17 +55,6 @@ extern unsigned int sparc_ramdisk_size;
 
				 
			
 
				 unsigned long highstart_pfn, highend_pfn;
			
 
				 
			
 
				-void show_mem(unsigned int filter)
			
 
				-{
			
 
				-	printk("Mem-info:\n");
			
 
				-	show_free_areas(filter);
			
 
				-	printk("Free swap:       %6ldkB\n",
			
 
				-	       get_nr_swap_pages() << (PAGE_SHIFT-10));
			
 
				-	printk("%ld pages of RAM\n", totalram_pages);
			
 
				-	printk("%ld free pages\n", nr_free_pages());
			
 
				-}
			
 
				-
			
 
				-
			
 
				 unsigned long last_valid_pfn;
			
 
				 
			
 
				 unsigned long calc_highpages(void)
			
--- a/arch/tile/mm/pgtable.c
+++ b/arch/tile/mm/pgtable.c
@@ -36,51 +36,6 @@
 
				 
			
 
				 #define K(x) ((x) << (PAGE_SHIFT-10))
			
 
				 
			
 
				-/*
			
 
				- * The normal show_free_areas() is too verbose on Tile, with dozens
			
 
				- * of processors and often four NUMA zones each with high and lowmem.
			
 
				- */
			
 
				-void show_mem(unsigned int filter)
			
 
				-{
			
 
				-	struct zone *zone;
			
 
				-
			
 
				-	pr_err("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu free:%lu\n slab:%lu mapped:%lu pagetables:%lu bounce:%lu pagecache:%lu swap:%lu\n",
			
 
				-	       (global_node_page_state(NR_ACTIVE_ANON) +
			
 
				-		global_node_page_state(NR_ACTIVE_FILE)),
			
 
				-	       (global_node_page_state(NR_INACTIVE_ANON) +
			
 
				-		global_node_page_state(NR_INACTIVE_FILE)),
			
 
				-	       global_node_page_state(NR_FILE_DIRTY),
			
 
				-	       global_node_page_state(NR_WRITEBACK),
			
 
				-	       global_node_page_state(NR_UNSTABLE_NFS),
			
 
				-	       global_page_state(NR_FREE_PAGES),
			
 
				-	       (global_page_state(NR_SLAB_RECLAIMABLE) +
			
 
				-		global_page_state(NR_SLAB_UNRECLAIMABLE)),
			
 
				-	       global_node_page_state(NR_FILE_MAPPED),
			
 
				-	       global_page_state(NR_PAGETABLE),
			
 
				-	       global_page_state(NR_BOUNCE),
			
 
				-	       global_node_page_state(NR_FILE_PAGES),
			
 
				-	       get_nr_swap_pages());
			
 
				-
			
 
				-	for_each_zone(zone) {
			
 
				-		unsigned long flags, order, total = 0, largest_order = -1;
			
 
				-
			
 
				-		if (!populated_zone(zone))
			
 
				-			continue;
			
 
				-
			
 
				-		spin_lock_irqsave(&zone->lock, flags);
			
 
				-		for (order = 0; order < MAX_ORDER; order++) {
			
 
				-			int nr = zone->free_area[order].nr_free;
			
 
				-			total += nr << order;
			
 
				-			if (nr)
			
 
				-				largest_order = order;
			
 
				-		}
			
 
				-		spin_unlock_irqrestore(&zone->lock, flags);
			
 
				-		pr_err("Node %d %7s: %lukB (largest %luKb)\n",
			
 
				-		       zone_to_nid(zone), zone->name,
			
 
				-		       K(total), largest_order ? K(1UL) << largest_order : 0);
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				 /**
			
 
				  * shatter_huge_page() - ensure a given address is mapped by a small page.
			
 
				  *
			
--- a/arch/unicore32/mm/init.c
+++ b/arch/unicore32/mm/init.c
@@ -57,50 +57,6 @@ early_param("initrd", early_initrd);
 
				  */
			
 
				 struct meminfo meminfo;
			
 
				 
			
 
				-void show_mem(unsigned int filter)
			
 
				-{
			
 
				-	int free = 0, total = 0, reserved = 0;
			
 
				-	int shared = 0, cached = 0, slab = 0, i;
			
 
				-	struct meminfo *mi = &meminfo;
			
 
				-
			
 
				-	printk(KERN_DEFAULT "Mem-info:\n");
			
 
				-	show_free_areas(filter);
			
 
				-
			
 
				-	for_each_bank(i, mi) {
			
 
				-		struct membank *bank = &mi->bank[i];
			
 
				-		unsigned int pfn1, pfn2;
			
 
				-		struct page *page, *end;
			
 
				-
			
 
				-		pfn1 = bank_pfn_start(bank);
			
 
				-		pfn2 = bank_pfn_end(bank);
			
 
				-
			
 
				-		page = pfn_to_page(pfn1);
			
 
				-		end  = pfn_to_page(pfn2 - 1) + 1;
			
 
				-
			
 
				-		do {
			
 
				-			total++;
			
 
				-			if (PageReserved(page))
			
 
				-				reserved++;
			
 
				-			else if (PageSwapCache(page))
			
 
				-				cached++;
			
 
				-			else if (PageSlab(page))
			
 
				-				slab++;
			
 
				-			else if (!page_count(page))
			
 
				-				free++;
			
 
				-			else
			
 
				-				shared += page_count(page) - 1;
			
 
				-			page++;
			
 
				-		} while (page < end);
			
 
				-	}
			
 
				-
			
 
				-	printk(KERN_DEFAULT "%d pages of RAM\n", total);
			
 
				-	printk(KERN_DEFAULT "%d free pages\n", free);
			
 
				-	printk(KERN_DEFAULT "%d reserved pages\n", reserved);
			
 
				-	printk(KERN_DEFAULT "%d slab pages\n", slab);
			
 
				-	printk(KERN_DEFAULT "%d pages shared\n", shared);
			
 
				-	printk(KERN_DEFAULT "%d pages swap cached\n", cached);
			
 
				-}
			
 
				-
			
 
				 static void __init find_limits(unsigned long *min, unsigned long *max_low,
			
 
				 	unsigned long *max_high)
			
 
				 {
			
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -679,7 +679,7 @@ static void __meminit free_pagetable(struct page *page, int order)
 
				 	if (PageReserved(page)) {
			
 
				 		__ClearPageReserved(page);
			
 
				 
			
 
				-		magic = (unsigned long)page->lru.next;
			
 
				+		magic = (unsigned long)page->freelist;
			
 
				 		if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
			
 
				 			while (nr_pages--)
			
 
				 				put_page_bootmem(page++);
			
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -796,7 +796,7 @@ static noinline int zap_bt_entries_mapping(struct mm_struct *mm,
 
				 			return -EINVAL;
			
 
				 
			
 
				 		len = min(vma->vm_end, end) - addr;
			
 
				-		zap_page_range(vma, addr, len, NULL);
			
 
				+		zap_page_range(vma, addr, len);
			
 
				 		trace_mpx_unmap_zap(addr, addr+len);
			
 
				 
			
 
				 		vma = vma->vm_next;
			
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -122,14 +122,14 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
 
				 			if (!user)
			
 
				 				break;
			
 
				 
			
 
				-			do_each_thread(g, p) {
			
 
				+			for_each_process_thread(g, p) {
			
 
				 				if (!uid_eq(task_uid(p), uid) ||
			
 
				 				    !task_pid_vnr(p))
			
 
				 					continue;
			
 
				 				ret = set_task_ioprio(p, ioprio);
			
 
				 				if (ret)
			
 
				 					goto free_uid;
			
 
				-			} while_each_thread(g, p);
			
 
				+			}
			
 
				 free_uid:
			
 
				 			if (who)
			
 
				 				free_uid(user);
			
@@ -222,7 +222,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
 
				 			if (!user)
			
 
				 				break;
			
 
				 
			
 
				-			do_each_thread(g, p) {
			
 
				+			for_each_process_thread(g, p) {
			
 
				 				if (!uid_eq(task_uid(p), user->uid) ||
			
 
				 				    !task_pid_vnr(p))
			
 
				 					continue;
			
@@ -233,7 +233,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
 
				 					ret = tmpio;
			
 
				 				else
			
 
				 					ret = ioprio_best(ret, tmpio);
			
 
				-			} while_each_thread(g, p);
			
 
				+			}
			
 
				 
			
 
				 			if (who)
			
 
				 				free_uid(user);
			
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -657,7 +657,7 @@ free_range:
 
				 		page = &proc->pages[(page_addr - proc->buffer) / PAGE_SIZE];
			
 
				 		if (vma)
			
 
				 			zap_page_range(vma, (uintptr_t)page_addr +
			
 
				-				proc->user_buffer_offset, PAGE_SIZE, NULL);
			
 
				+				proc->user_buffer_offset, PAGE_SIZE);
			
 
				 err_vm_insert_page_failed:
			
 
				 		unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE);
			
 
				 err_map_kernel_failed:
			
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -45,27 +45,6 @@ static const char *default_compressor = "lzo";
 
				 /* Module params (documentation at end) */
			
 
				 static unsigned int num_devices = 1;
			
 
				 
			
 
				-static inline void deprecated_attr_warn(const char *name)
			
 
				-{
			
 
				-	pr_warn_once("%d (%s) Attribute %s (and others) will be removed. %s\n",
			
 
				-			task_pid_nr(current),
			
 
				-			current->comm,
			
 
				-			name,
			
 
				-			"See zram documentation.");
			
 
				-}
			
 
				-
			
 
				-#define ZRAM_ATTR_RO(name)						\
			
 
				-static ssize_t name##_show(struct device *d,				\
			
 
				-				struct device_attribute *attr, char *b)	\
			
 
				-{									\
			
 
				-	struct zram *zram = dev_to_zram(d);				\
			
 
				-									\
			
 
				-	deprecated_attr_warn(__stringify(name));			\
			
 
				-	return scnprintf(b, PAGE_SIZE, "%llu\n",			\
			
 
				-		(u64)atomic64_read(&zram->stats.name));			\
			
 
				-}									\
			
 
				-static DEVICE_ATTR_RO(name);
			
 
				-
			
 
				 static inline bool init_done(struct zram *zram)
			
 
				 {
			
 
				 	return zram->disksize;
			
@@ -218,47 +197,6 @@ static ssize_t disksize_show(struct device *dev,
 
				 	return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize);
			
 
				 }
			
 
				 
			
 
				-static ssize_t orig_data_size_show(struct device *dev,
			
 
				-		struct device_attribute *attr, char *buf)
			
 
				-{
			
 
				-	struct zram *zram = dev_to_zram(dev);
			
 
				-
			
 
				-	deprecated_attr_warn("orig_data_size");
			
 
				-	return scnprintf(buf, PAGE_SIZE, "%llu\n",
			
 
				-		(u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT);
			
 
				-}
			
 
				-
			
 
				-static ssize_t mem_used_total_show(struct device *dev,
			
 
				-		struct device_attribute *attr, char *buf)
			
 
				-{
			
 
				-	u64 val = 0;
			
 
				-	struct zram *zram = dev_to_zram(dev);
			
 
				-
			
 
				-	deprecated_attr_warn("mem_used_total");
			
 
				-	down_read(&zram->init_lock);
			
 
				-	if (init_done(zram)) {
			
 
				-		struct zram_meta *meta = zram->meta;
			
 
				-		val = zs_get_total_pages(meta->mem_pool);
			
 
				-	}
			
 
				-	up_read(&zram->init_lock);
			
 
				-
			
 
				-	return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT);
			
 
				-}
			
 
				-
			
 
				-static ssize_t mem_limit_show(struct device *dev,
			
 
				-		struct device_attribute *attr, char *buf)
			
 
				-{
			
 
				-	u64 val;
			
 
				-	struct zram *zram = dev_to_zram(dev);
			
 
				-
			
 
				-	deprecated_attr_warn("mem_limit");
			
 
				-	down_read(&zram->init_lock);
			
 
				-	val = zram->limit_pages;
			
 
				-	up_read(&zram->init_lock);
			
 
				-
			
 
				-	return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT);
			
 
				-}
			
 
				-
			
 
				 static ssize_t mem_limit_store(struct device *dev,
			
 
				 		struct device_attribute *attr, const char *buf, size_t len)
			
 
				 {
			
@@ -277,21 +215,6 @@ static ssize_t mem_limit_store(struct device *dev,
 
				 	return len;
			
 
				 }
			
 
				 
			
 
				-static ssize_t mem_used_max_show(struct device *dev,
			
 
				-		struct device_attribute *attr, char *buf)
			
 
				-{
			
 
				-	u64 val = 0;
			
 
				-	struct zram *zram = dev_to_zram(dev);
			
 
				-
			
 
				-	deprecated_attr_warn("mem_used_max");
			
 
				-	down_read(&zram->init_lock);
			
 
				-	if (init_done(zram))
			
 
				-		val = atomic_long_read(&zram->stats.max_used_pages);
			
 
				-	up_read(&zram->init_lock);
			
 
				-
			
 
				-	return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT);
			
 
				-}
			
 
				-
			
 
				 static ssize_t mem_used_max_store(struct device *dev,
			
 
				 		struct device_attribute *attr, const char *buf, size_t len)
			
 
				 {
			
@@ -467,14 +390,6 @@ static ssize_t debug_stat_show(struct device *dev,
 
				 static DEVICE_ATTR_RO(io_stat);
			
 
				 static DEVICE_ATTR_RO(mm_stat);
			
 
				 static DEVICE_ATTR_RO(debug_stat);
			
 
				-ZRAM_ATTR_RO(num_reads);
			
 
				-ZRAM_ATTR_RO(num_writes);
			
 
				-ZRAM_ATTR_RO(failed_reads);
			
 
				-ZRAM_ATTR_RO(failed_writes);
			
 
				-ZRAM_ATTR_RO(invalid_io);
			
 
				-ZRAM_ATTR_RO(notify_free);
			
 
				-ZRAM_ATTR_RO(zero_pages);
			
 
				-ZRAM_ATTR_RO(compr_data_size);
			
 
				 
			
 
				 static inline bool zram_meta_get(struct zram *zram)
			
 
				 {
			
@@ -1188,10 +1103,8 @@ static DEVICE_ATTR_WO(compact);
 
				 static DEVICE_ATTR_RW(disksize);
			
 
				 static DEVICE_ATTR_RO(initstate);
			
 
				 static DEVICE_ATTR_WO(reset);
			
 
				-static DEVICE_ATTR_RO(orig_data_size);
			
 
				-static DEVICE_ATTR_RO(mem_used_total);
			
 
				-static DEVICE_ATTR_RW(mem_limit);
			
 
				-static DEVICE_ATTR_RW(mem_used_max);
			
 
				+static DEVICE_ATTR_WO(mem_limit);
			
 
				+static DEVICE_ATTR_WO(mem_used_max);
			
 
				 static DEVICE_ATTR_RW(max_comp_streams);
			
 
				 static DEVICE_ATTR_RW(comp_algorithm);
			
 
				 
			
@@ -1199,17 +1112,7 @@ static struct attribute *zram_disk_attrs[] = {
 
				 	&dev_attr_disksize.attr,
			
 
				 	&dev_attr_initstate.attr,
			
 
				 	&dev_attr_reset.attr,
			
 
				-	&dev_attr_num_reads.attr,
			
 
				-	&dev_attr_num_writes.attr,
			
 
				-	&dev_attr_failed_reads.attr,
			
 
				-	&dev_attr_failed_writes.attr,
			
 
				 	&dev_attr_compact.attr,
			
 
				-	&dev_attr_invalid_io.attr,
			
 
				-	&dev_attr_notify_free.attr,
			
 
				-	&dev_attr_zero_pages.attr,
			
 
				-	&dev_attr_orig_data_size.attr,
			
 
				-	&dev_attr_compr_data_size.attr,
			
 
				-	&dev_attr_mem_used_total.attr,
			
 
				 	&dev_attr_mem_limit.attr,
			
 
				 	&dev_attr_mem_used_max.attr,
			
 
				 	&dev_attr_max_comp_streams.attr,
			
--- a/drivers/dax/dax.c
+++ b/drivers/dax/dax.c
@@ -472,18 +472,16 @@ static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 
				 	return rc;
			
 
				 }
			
 
				 
			
 
				-static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
			
 
				-		struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd,
			
 
				-		unsigned int flags)
			
 
				+static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
			
 
				 {
			
 
				-	unsigned long pmd_addr = addr & PMD_MASK;
			
 
				+	unsigned long pmd_addr = vmf->address & PMD_MASK;
			
 
				 	struct device *dev = &dax_dev->dev;
			
 
				 	struct dax_region *dax_region;
			
 
				 	phys_addr_t phys;
			
 
				 	pgoff_t pgoff;
			
 
				 	pfn_t pfn;
			
 
				 
			
 
				-	if (check_vma(dax_dev, vma, __func__))
			
 
				+	if (check_vma(dax_dev, vmf->vma, __func__))
			
 
				 		return VM_FAULT_SIGBUS;
			
 
				 
			
 
				 	dax_region = dax_dev->region;
			
@@ -498,7 +496,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
 
				 		return VM_FAULT_SIGBUS;
			
 
				 	}
			
 
				 
			
 
				-	pgoff = linear_page_index(vma, pmd_addr);
			
 
				+	pgoff = linear_page_index(vmf->vma, pmd_addr);
			
 
				 	phys = pgoff_to_phys(dax_dev, pgoff, PMD_SIZE);
			
 
				 	if (phys == -1) {
			
 
				 		dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__,
			
@@ -508,23 +506,23 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
 
				 
			
 
				 	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
			
 
				 
			
 
				-	return vmf_insert_pfn_pmd(vma, addr, pmd, pfn,
			
 
				-			flags & FAULT_FLAG_WRITE);
			
 
				+	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn,
			
 
				+			vmf->flags & FAULT_FLAG_WRITE);
			
 
				 }
			
 
				 
			
 
				-static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
			
 
				-		pmd_t *pmd, unsigned int flags)
			
 
				+static int dax_dev_pmd_fault(struct vm_fault *vmf)
			
 
				 {
			
 
				 	int rc;
			
 
				-	struct file *filp = vma->vm_file;
			
 
				+	struct file *filp = vmf->vma->vm_file;
			
 
				 	struct dax_dev *dax_dev = filp->private_data;
			
 
				 
			
 
				 	dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
			
 
				-			current->comm, (flags & FAULT_FLAG_WRITE)
			
 
				-			? "write" : "read", vma->vm_start, vma->vm_end);
			
 
				+			current->comm, (vmf->flags & FAULT_FLAG_WRITE)
			
 
				+			? "write" : "read",
			
 
				+			vmf->vma->vm_start, vmf->vma->vm_end);
			
 
				 
			
 
				 	rcu_read_lock();
			
 
				-	rc = __dax_dev_pmd_fault(dax_dev, vma, addr, pmd, flags);
			
 
				+	rc = __dax_dev_pmd_fault(dax_dev, vmf);
			
 
				 	rcu_read_unlock();
			
 
				 
			
 
				 	return rc;
			
--- a/drivers/net/ethernet/sgi/ioc3-eth.c
+++ b/drivers/net/ethernet/sgi/ioc3-eth.c
@@ -914,7 +914,7 @@ static void ioc3_alloc_rings(struct net_device *dev)
 
				 
			
 
				 			skb = ioc3_alloc_skb(RX_BUF_ALLOC_SIZE, GFP_ATOMIC);
			
 
				 			if (!skb) {
			
 
				-				show_free_areas(0);
			
 
				+				show_free_areas(0, NULL);
			
 
				 				continue;
			
 
				 			}
			
 
				 
			
--- a/drivers/staging/android/ion/ion.c
+++ b/drivers/staging/android/ion/ion.c
@@ -865,8 +865,7 @@ static void ion_buffer_sync_for_device(struct ion_buffer *buffer,
 
				 	list_for_each_entry(vma_list, &buffer->vmas, list) {
			
 
				 		struct vm_area_struct *vma = vma_list->vma;
			
 
				 
			
 
				-		zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start,
			
 
				-			       NULL);
			
 
				+		zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start);
			
 
				 	}
			
 
				 	mutex_unlock(&buffer->lock);
			
 
				 }
			
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -317,7 +317,7 @@ static struct sysrq_key_op sysrq_ftrace_dump_op = {
 
				 
			
 
				 static void sysrq_handle_showmem(int key)
			
 
				 {
			
 
				-	show_mem(0);
			
 
				+	show_mem(0, NULL);
			
 
				 }
			
 
				 static struct sysrq_key_op sysrq_showmem_op = {
			
 
				 	.handler	= sysrq_handle_showmem,
			
--- a/drivers/tty/vt/keyboard.c
+++ b/drivers/tty/vt/keyboard.c
@@ -572,7 +572,7 @@ static void fn_scroll_back(struct vc_data *vc)
 
				 
			
 
				 static void fn_show_mem(struct vc_data *vc)
			
 
				 {
			
 
				-	show_mem(0);
			
 
				+	show_mem(0, NULL);
			
 
				 }
			
 
				 
			
 
				 static void fn_show_state(struct vc_data *vc)
			
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -277,6 +277,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
 
				 	case ACL_TYPE_ACCESS:
			
 
				 		if (acl) {
			
 
				 			struct iattr iattr;
			
 
				+			struct posix_acl *old_acl = acl;
			
 
				 
			
 
				 			retval = posix_acl_update_mode(inode, &iattr.ia_mode, &acl);
			
 
				 			if (retval)
			
@@ -287,6 +288,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
 
				 				 * by the mode bits. So don't
			
 
				 				 * update ACL.
			
 
				 				 */
			
 
				+				posix_acl_release(old_acl);
			
 
				 				value = NULL;
			
 
				 				size = 0;
			
 
				 			}
			
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -91,12 +91,18 @@ static struct linux_binfmt elf_format = {
 
				 
			
 
				 #define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
			
 
				 
			
 
				-static int set_brk(unsigned long start, unsigned long end)
			
 
				+static int set_brk(unsigned long start, unsigned long end, int prot)
			
 
				 {
			
 
				 	start = ELF_PAGEALIGN(start);
			
 
				 	end = ELF_PAGEALIGN(end);
			
 
				 	if (end > start) {
			
 
				-		int error = vm_brk(start, end - start);
			
 
				+		/*
			
 
				+		 * Map the last of the bss segment.
			
 
				+		 * If the header is requesting these pages to be
			
 
				+		 * executable, honour that (ppc32 needs this).
			
 
				+		 */
			
 
				+		int error = vm_brk_flags(start, end - start,
			
 
				+				prot & PROT_EXEC ? VM_EXEC : 0);
			
 
				 		if (error)
			
 
				 			return error;
			
 
				 	}
			
@@ -524,6 +530,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 
				 	unsigned long load_addr = 0;
			
 
				 	int load_addr_set = 0;
			
 
				 	unsigned long last_bss = 0, elf_bss = 0;
			
 
				+	int bss_prot = 0;
			
 
				 	unsigned long error = ~0UL;
			
 
				 	unsigned long total_size;
			
 
				 	int i;
			
@@ -606,8 +613,10 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 
				 			 * elf_bss and last_bss is the bss section.
			
 
				 			 */
			
 
				 			k = load_addr + eppnt->p_vaddr + eppnt->p_memsz;
			
 
				-			if (k > last_bss)
			
 
				+			if (k > last_bss) {
			
 
				 				last_bss = k;
			
 
				+				bss_prot = elf_prot;
			
 
				+			}
			
 
				 		}
			
 
				 	}
			
 
				 
			
@@ -623,13 +632,14 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 
				 	/*
			
 
				 	 * Next, align both the file and mem bss up to the page size,
			
 
				 	 * since this is where elf_bss was just zeroed up to, and where
			
 
				-	 * last_bss will end after the vm_brk() below.
			
 
				+	 * last_bss will end after the vm_brk_flags() below.
			
 
				 	 */
			
 
				 	elf_bss = ELF_PAGEALIGN(elf_bss);
			
 
				 	last_bss = ELF_PAGEALIGN(last_bss);
			
 
				 	/* Finally, if there is still more bss to allocate, do it. */
			
 
				 	if (last_bss > elf_bss) {
			
 
				-		error = vm_brk(elf_bss, last_bss - elf_bss);
			
 
				+		error = vm_brk_flags(elf_bss, last_bss - elf_bss,
			
 
				+				bss_prot & PROT_EXEC ? VM_EXEC : 0);
			
 
				 		if (error)
			
 
				 			goto out;
			
 
				 	}
			
@@ -674,6 +684,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
 
				 	unsigned long error;
			
 
				 	struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
			
 
				 	unsigned long elf_bss, elf_brk;
			
 
				+	int bss_prot = 0;
			
 
				 	int retval, i;
			
 
				 	unsigned long elf_entry;
			
 
				 	unsigned long interp_load_addr = 0;
			
@@ -882,7 +893,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
 
				 			   before this one. Map anonymous pages, if needed,
			
 
				 			   and clear the area.  */
			
 
				 			retval = set_brk(elf_bss + load_bias,
			
 
				-					 elf_brk + load_bias);
			
 
				+					 elf_brk + load_bias,
			
 
				+					 bss_prot);
			
 
				 			if (retval)
			
 
				 				goto out_free_dentry;
			
 
				 			nbyte = ELF_PAGEOFFSET(elf_bss);
			
@@ -976,8 +988,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
 
				 		if (end_data < k)
			
 
				 			end_data = k;
			
 
				 		k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz;
			
 
				-		if (k > elf_brk)
			
 
				+		if (k > elf_brk) {
			
 
				+			bss_prot = elf_prot;
			
 
				 			elf_brk = k;
			
 
				+		}
			
 
				 	}
			
 
				 
			
 
				 	loc->elf_ex.e_entry += load_bias;
			
@@ -993,7 +1007,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
 
				 	 * mapping in the interpreter, to make sure it doesn't wind
			
 
				 	 * up getting placed where the bss needs to go.
			
 
				 	 */
			
 
				-	retval = set_brk(elf_bss, elf_brk);
			
 
				+	retval = set_brk(elf_bss, elf_brk, bss_prot);
			
 
				 	if (retval)
			
 
				 		goto out_free_dentry;
			
 
				 	if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) {
			
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -35,6 +35,9 @@
 
				 #include <linux/iomap.h>
			
 
				 #include "internal.h"
			
 
				 
			
 
				+#define CREATE_TRACE_POINTS
			
 
				+#include <trace/events/fs_dax.h>
			
 
				+
			
 
				 /* We choose 4096 entries - same as per-zone page wait tables */
			
 
				 #define DAX_WAIT_TABLE_BITS 12
			
 
				 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
			
@@ -1253,21 +1256,21 @@ EXPORT_SYMBOL_GPL(dax_iomap_fault);
 
				  */
			
 
				 #define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
			
 
				 
			
 
				-static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
			
 
				-		struct vm_fault *vmf, unsigned long address,
			
 
				-		struct iomap *iomap, loff_t pos, bool write, void **entryp)
			
 
				+static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
			
 
				+		loff_t pos, void **entryp)
			
 
				 {
			
 
				-	struct address_space *mapping = vma->vm_file->f_mapping;
			
 
				+	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
			
 
				 	struct block_device *bdev = iomap->bdev;
			
 
				+	struct inode *inode = mapping->host;
			
 
				 	struct blk_dax_ctl dax = {
			
 
				 		.sector = dax_iomap_sector(iomap, pos),
			
 
				 		.size = PMD_SIZE,
			
 
				 	};
			
 
				 	long length = dax_map_atomic(bdev, &dax);
			
 
				-	void *ret;
			
 
				+	void *ret = NULL;
			
 
				 
			
 
				 	if (length < 0) /* dax_map_atomic() failed */
			
 
				-		return VM_FAULT_FALLBACK;
			
 
				+		goto fallback;
			
 
				 	if (length < PMD_SIZE)
			
 
				 		goto unmap_fallback;
			
 
				 	if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)
			
@@ -1280,67 +1283,86 @@ static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
 
				 	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector,
			
 
				 			RADIX_DAX_PMD);
			
 
				 	if (IS_ERR(ret))
			
 
				-		return VM_FAULT_FALLBACK;
			
 
				+		goto fallback;
			
 
				 	*entryp = ret;
			
 
				 
			
 
				-	return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write);
			
 
				+	trace_dax_pmd_insert_mapping(inode, vmf, length, dax.pfn, ret);
			
 
				+	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
			
 
				+			dax.pfn, vmf->flags & FAULT_FLAG_WRITE);
			
 
				 
			
 
				  unmap_fallback:
			
 
				 	dax_unmap_atomic(bdev, &dax);
			
 
				+fallback:
			
 
				+	trace_dax_pmd_insert_mapping_fallback(inode, vmf, length,
			
 
				+			dax.pfn, ret);
			
 
				 	return VM_FAULT_FALLBACK;
			
 
				 }
			
 
				 
			
 
				-static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
			
 
				-		struct vm_fault *vmf, unsigned long address,
			
 
				-		struct iomap *iomap, void **entryp)
			
 
				+static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
			
 
				+		void **entryp)
			
 
				 {
			
 
				-	struct address_space *mapping = vma->vm_file->f_mapping;
			
 
				-	unsigned long pmd_addr = address & PMD_MASK;
			
 
				+	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
			
 
				+	unsigned long pmd_addr = vmf->address & PMD_MASK;
			
 
				+	struct inode *inode = mapping->host;
			
 
				 	struct page *zero_page;
			
 
				+	void *ret = NULL;
			
 
				 	spinlock_t *ptl;
			
 
				 	pmd_t pmd_entry;
			
 
				-	void *ret;
			
 
				 
			
 
				-	zero_page = mm_get_huge_zero_page(vma->vm_mm);
			
 
				+	zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
			
 
				 
			
 
				 	if (unlikely(!zero_page))
			
 
				-		return VM_FAULT_FALLBACK;
			
 
				+		goto fallback;
			
 
				 
			
 
				 	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0,
			
 
				 			RADIX_DAX_PMD | RADIX_DAX_HZP);
			
 
				 	if (IS_ERR(ret))
			
 
				-		return VM_FAULT_FALLBACK;
			
 
				+		goto fallback;
			
 
				 	*entryp = ret;
			
 
				 
			
 
				-	ptl = pmd_lock(vma->vm_mm, pmd);
			
 
				-	if (!pmd_none(*pmd)) {
			
 
				+	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
			
 
				+	if (!pmd_none(*(vmf->pmd))) {
			
 
				 		spin_unlock(ptl);
			
 
				-		return VM_FAULT_FALLBACK;
			
 
				+		goto fallback;
			
 
				 	}
			
 
				 
			
 
				-	pmd_entry = mk_pmd(zero_page, vma->vm_page_prot);
			
 
				+	pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
			
 
				 	pmd_entry = pmd_mkhuge(pmd_entry);
			
 
				-	set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry);
			
 
				+	set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
			
 
				 	spin_unlock(ptl);
			
 
				+	trace_dax_pmd_load_hole(inode, vmf, zero_page, ret);
			
 
				 	return VM_FAULT_NOPAGE;
			
 
				+
			
 
				+fallback:
			
 
				+	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret);
			
 
				+	return VM_FAULT_FALLBACK;
			
 
				 }
			
 
				 
			
 
				-int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
			
 
				-		pmd_t *pmd, unsigned int flags, const struct iomap_ops *ops)
			
 
				+int dax_iomap_pmd_fault(struct vm_fault *vmf, const struct iomap_ops *ops)
			
 
				 {
			
 
				+	struct vm_area_struct *vma = vmf->vma;
			
 
				 	struct address_space *mapping = vma->vm_file->f_mapping;
			
 
				-	unsigned long pmd_addr = address & PMD_MASK;
			
 
				-	bool write = flags & FAULT_FLAG_WRITE;
			
 
				+	unsigned long pmd_addr = vmf->address & PMD_MASK;
			
 
				+	bool write = vmf->flags & FAULT_FLAG_WRITE;
			
 
				 	unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
			
 
				 	struct inode *inode = mapping->host;
			
 
				 	int result = VM_FAULT_FALLBACK;
			
 
				 	struct iomap iomap = { 0 };
			
 
				 	pgoff_t max_pgoff, pgoff;
			
 
				-	struct vm_fault vmf;
			
 
				 	void *entry;
			
 
				 	loff_t pos;
			
 
				 	int error;
			
 
				 
			
 
				+	/*
			
 
				+	 * Check whether offset isn't beyond end of file now. Caller is
			
 
				+	 * supposed to hold locks serializing us with truncate / punch hole so
			
 
				+	 * this is a reliable test.
			
 
				+	 */
			
 
				+	pgoff = linear_page_index(vma, pmd_addr);
			
 
				+	max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
			
 
				+
			
 
				+	trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
			
 
				+
			
 
				 	/* Fall back to PTEs if we're going to COW */
			
 
				 	if (write && !(vma->vm_flags & VM_SHARED))
			
 
				 		goto fallback;
			
@@ -1351,16 +1373,10 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 
				 	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
			
 
				 		goto fallback;
			
 
				 
			
 
				-	/*
			
 
				-	 * Check whether offset isn't beyond end of file now. Caller is
			
 
				-	 * supposed to hold locks serializing us with truncate / punch hole so
			
 
				-	 * this is a reliable test.
			
 
				-	 */
			
 
				-	pgoff = linear_page_index(vma, pmd_addr);
			
 
				-	max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
			
 
				-
			
 
				-	if (pgoff > max_pgoff)
			
 
				-		return VM_FAULT_SIGBUS;
			
 
				+	if (pgoff > max_pgoff) {
			
 
				+		result = VM_FAULT_SIGBUS;
			
 
				+		goto out;
			
 
				+	}
			
 
				 
			
 
				 	/* If the PMD would extend beyond the file size */
			
 
				 	if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
			
@@ -1389,21 +1405,15 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 
				 	if (IS_ERR(entry))
			
 
				 		goto finish_iomap;
			
 
				 
			
 
				-	vmf.pgoff = pgoff;
			
 
				-	vmf.flags = flags;
			
 
				-	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
			
 
				-
			
 
				 	switch (iomap.type) {
			
 
				 	case IOMAP_MAPPED:
			
 
				-		result = dax_pmd_insert_mapping(vma, pmd, &vmf, address,
			
 
				-				&iomap, pos, write, &entry);
			
 
				+		result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry);
			
 
				 		break;
			
 
				 	case IOMAP_UNWRITTEN:
			
 
				 	case IOMAP_HOLE:
			
 
				 		if (WARN_ON_ONCE(write))
			
 
				 			goto unlock_entry;
			
 
				-		result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
			
 
				-				&entry);
			
 
				+		result = dax_pmd_load_hole(vmf, &iomap, &entry);
			
 
				 		break;
			
 
				 	default:
			
 
				 		WARN_ON_ONCE(1);
			
@@ -1429,9 +1439,11 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 
				 	}
			
 
				  fallback:
			
 
				 	if (result == VM_FAULT_FALLBACK) {
			
 
				-		split_huge_pmd(vma, pmd, address);
			
 
				+		split_huge_pmd(vma, vmf->pmd, vmf->address);
			
 
				 		count_vm_event(THP_FAULT_FALLBACK);
			
 
				 	}
			
 
				+out:
			
 
				+	trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result);
			
 
				 	return result;
			
 
				 }
			
 
				 EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault);
			
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -273,21 +273,20 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 
				 	return result;
			
 
				 }
			
 
				 
			
 
				-static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
			
 
				-						pmd_t *pmd, unsigned int flags)
			
 
				+static int
			
 
				+ext4_dax_pmd_fault(struct vm_fault *vmf)
			
 
				 {
			
 
				 	int result;
			
 
				-	struct inode *inode = file_inode(vma->vm_file);
			
 
				+	struct inode *inode = file_inode(vmf->vma->vm_file);
			
 
				 	struct super_block *sb = inode->i_sb;
			
 
				-	bool write = flags & FAULT_FLAG_WRITE;
			
 
				+	bool write = vmf->flags & FAULT_FLAG_WRITE;
			
 
				 
			
 
				 	if (write) {
			
 
				 		sb_start_pagefault(sb);
			
 
				-		file_update_time(vma->vm_file);
			
 
				+		file_update_time(vmf->vma->vm_file);
			
 
				 	}
			
 
				 	down_read(&EXT4_I(inode)->i_mmap_sem);
			
 
				-	result = dax_iomap_pmd_fault(vma, addr, pmd, flags,
			
 
				-				     &ext4_iomap_ops);
			
 
				+	result = dax_iomap_pmd_fault(vmf, &ext4_iomap_ops);
			
 
				 	up_read(&EXT4_I(inode)->i_mmap_sem);
			
 
				 	if (write)
			
 
				 		sb_end_pagefault(sb);
			
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -728,8 +728,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)
 
				 		if (likely(head->wb_page && !PageSwapCache(head->wb_page))) {
			
 
				 			set_page_private(head->wb_page, 0);
			
 
				 			ClearPagePrivate(head->wb_page);
			
 
				-			smp_mb__after_atomic();
			
 
				-			wake_up_page(head->wb_page, PG_private);
			
 
				 			clear_bit(PG_MAPPED, &head->wb_flags);
			
 
				 		}
			
 
				 		nfsi->nrequests--;
			
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -283,16 +283,14 @@ int ocfs2_set_acl(handle_t *handle,
 
				 int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
			
 
				 {
			
 
				 	struct buffer_head *bh = NULL;
			
 
				-	int status = 0;
			
 
				+	int status, had_lock;
			
 
				+	struct ocfs2_lock_holder oh;
			
 
				 
			
 
				-	status = ocfs2_inode_lock(inode, &bh, 1);
			
 
				-	if (status < 0) {
			
 
				-		if (status != -ENOENT)
			
 
				-			mlog_errno(status);
			
 
				-		return status;
			
 
				-	}
			
 
				+	had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh);
			
 
				+	if (had_lock < 0)
			
 
				+		return had_lock;
			
 
				 	status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL);
			
 
				-	ocfs2_inode_unlock(inode, 1);
			
 
				+	ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
			
 
				 	brelse(bh);
			
 
				 	return status;
			
 
				 }
			
@@ -302,21 +300,20 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
 
				 	struct ocfs2_super *osb;
			
 
				 	struct buffer_head *di_bh = NULL;
			
 
				 	struct posix_acl *acl;
			
 
				-	int ret;
			
 
				+	int had_lock;
			
 
				+	struct ocfs2_lock_holder oh;
			
 
				 
			
 
				 	osb = OCFS2_SB(inode->i_sb);
			
 
				 	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
			
 
				 		return NULL;
			
 
				-	ret = ocfs2_inode_lock(inode, &di_bh, 0);
			
 
				-	if (ret < 0) {
			
 
				-		if (ret != -ENOENT)
			
 
				-			mlog_errno(ret);
			
 
				-		return ERR_PTR(ret);
			
 
				-	}
			
 
				+
			
 
				+	had_lock = ocfs2_inode_lock_tracker(inode, &di_bh, 0, &oh);
			
 
				+	if (had_lock < 0)
			
 
				+		return ERR_PTR(had_lock);
			
 
				 
			
 
				 	acl = ocfs2_get_acl_nolock(inode, type, di_bh);
			
 
				 
			
 
				-	ocfs2_inode_unlock(inode, 0);
			
 
				+	ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock);
			
 
				 	brelse(di_bh);
			
 
				 	return acl;
			
 
				 }
			
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -532,6 +532,7 @@ void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
 
				 	init_waitqueue_head(&res->l_event);
			
 
				 	INIT_LIST_HEAD(&res->l_blocked_list);
			
 
				 	INIT_LIST_HEAD(&res->l_mask_waiters);
			
 
				+	INIT_LIST_HEAD(&res->l_holders);
			
 
				 }
			
 
				 
			
 
				 void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
			
@@ -749,6 +750,50 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
 
				 	res->l_flags = 0UL;
			
 
				 }
			
 
				 
			
 
				+/*
			
 
				+ * Keep a list of processes who have interest in a lockres.
			
 
				+ * Note: this is now only used to check for recursive cluster locking.
			
 
				+ */
			
 
				+static inline void ocfs2_add_holder(struct ocfs2_lock_res *lockres,
			
 
				+				   struct ocfs2_lock_holder *oh)
			
 
				+{
			
 
				+	INIT_LIST_HEAD(&oh->oh_list);
			
 
				+	oh->oh_owner_pid = get_pid(task_pid(current));
			
 
				+
			
 
				+	spin_lock(&lockres->l_lock);
			
 
				+	list_add_tail(&oh->oh_list, &lockres->l_holders);
			
 
				+	spin_unlock(&lockres->l_lock);
			
 
				+}
			
 
				+
			
 
				+static inline void ocfs2_remove_holder(struct ocfs2_lock_res *lockres,
			
 
				+				       struct ocfs2_lock_holder *oh)
			
 
				+{
			
 
				+	spin_lock(&lockres->l_lock);
			
 
				+	list_del(&oh->oh_list);
			
 
				+	spin_unlock(&lockres->l_lock);
			
 
				+
			
 
				+	put_pid(oh->oh_owner_pid);
			
 
				+}
			
 
				+
			
 
				+static inline int ocfs2_is_locked_by_me(struct ocfs2_lock_res *lockres)
			
 
				+{
			
 
				+	struct ocfs2_lock_holder *oh;
			
 
				+	struct pid *pid;
			
 
				+
			
 
				+	/* look in the list of holders for one with the current task as owner */
			
 
				+	spin_lock(&lockres->l_lock);
			
 
				+	pid = task_pid(current);
			
 
				+	list_for_each_entry(oh, &lockres->l_holders, oh_list) {
			
 
				+		if (oh->oh_owner_pid == pid) {
			
 
				+			spin_unlock(&lockres->l_lock);
			
 
				+			return 1;
			
 
				+		}
			
 
				+	}
			
 
				+	spin_unlock(&lockres->l_lock);
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				 static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
			
 
				 				     int level)
			
 
				 {
			
@@ -2333,8 +2378,9 @@ int ocfs2_inode_lock_full_nested(struct inode *inode,
 
				 		goto getbh;
			
 
				 	}
			
 
				 
			
 
				-	if (ocfs2_mount_local(osb))
			
 
				-		goto local;
			
 
				+	if ((arg_flags & OCFS2_META_LOCK_GETBH) ||
			
 
				+	    ocfs2_mount_local(osb))
			
 
				+		goto update;
			
 
				 
			
 
				 	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
			
 
				 		ocfs2_wait_for_recovery(osb);
			
@@ -2363,7 +2409,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode,
 
				 	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
			
 
				 		ocfs2_wait_for_recovery(osb);
			
 
				 
			
 
				-local:
			
 
				+update:
			
 
				 	/*
			
 
				 	 * We only see this flag if we're being called from
			
 
				 	 * ocfs2_read_locked_inode(). It means we're locking an inode
			
@@ -2497,6 +2543,59 @@ void ocfs2_inode_unlock(struct inode *inode,
 
				 		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
			
 
				 }
			
 
				 
			
 
				+/*
			
 
				+ * These _tracker variants are introduced to deal with the recursive cluster
			
 
				+ * locking issue. The idea is to keep track of a lock holder on the stack of
			
 
				+ * the current process. If there's a lock holder on the stack, we know the
			
 
				+ * task context is already protected by cluster locking. Currently, they're
			
 
				+ * used in some VFS entry routines.
			
 
				+ *
			
 
				+ * return < 0 on error, return == 0 if there's no lock holder on the stack
			
 
				+ * before this call, return == 1 if this call would be a recursive locking.
			
 
				+ */
			
 
				+int ocfs2_inode_lock_tracker(struct inode *inode,
			
 
				+			     struct buffer_head **ret_bh,
			
 
				+			     int ex,
			
 
				+			     struct ocfs2_lock_holder *oh)
			
 
				+{
			
 
				+	int status;
			
 
				+	int arg_flags = 0, has_locked;
			
 
				+	struct ocfs2_lock_res *lockres;
			
 
				+
			
 
				+	lockres = &OCFS2_I(inode)->ip_inode_lockres;
			
 
				+	has_locked = ocfs2_is_locked_by_me(lockres);
			
 
				+	/* Just get buffer head if the cluster lock has been taken */
			
 
				+	if (has_locked)
			
 
				+		arg_flags = OCFS2_META_LOCK_GETBH;
			
 
				+
			
 
				+	if (likely(!has_locked || ret_bh)) {
			
 
				+		status = ocfs2_inode_lock_full(inode, ret_bh, ex, arg_flags);
			
 
				+		if (status < 0) {
			
 
				+			if (status != -ENOENT)
			
 
				+				mlog_errno(status);
			
 
				+			return status;
			
 
				+		}
			
 
				+	}
			
 
				+	if (!has_locked)
			
 
				+		ocfs2_add_holder(lockres, oh);
			
 
				+
			
 
				+	return has_locked;
			
 
				+}
			
 
				+
			
 
				+void ocfs2_inode_unlock_tracker(struct inode *inode,
			
 
				+				int ex,
			
 
				+				struct ocfs2_lock_holder *oh,
			
 
				+				int had_lock)
			
 
				+{
			
 
				+	struct ocfs2_lock_res *lockres;
			
 
				+
			
 
				+	lockres = &OCFS2_I(inode)->ip_inode_lockres;
			
 
				+	if (!had_lock) {
			
 
				+		ocfs2_remove_holder(lockres, oh);
			
 
				+		ocfs2_inode_unlock(inode, ex);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				 int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno)
			
 
				 {
			
 
				 	struct ocfs2_lock_res *lockres;
			
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -70,6 +70,11 @@ struct ocfs2_orphan_scan_lvb {
 
				 	__be32	lvb_os_seqno;
			
 
				 };
			
 
				 
			
 
				+struct ocfs2_lock_holder {
			
 
				+	struct list_head oh_list;
			
 
				+	struct pid *oh_owner_pid;
			
 
				+};
			
 
				+
			
 
				 /* ocfs2_inode_lock_full() 'arg_flags' flags */
			
 
				 /* don't wait on recovery. */
			
 
				 #define OCFS2_META_LOCK_RECOVERY	(0x01)
			
@@ -77,6 +82,8 @@ struct ocfs2_orphan_scan_lvb {
 
				 #define OCFS2_META_LOCK_NOQUEUE		(0x02)
			
 
				 /* don't block waiting for the downconvert thread, instead return -EAGAIN */
			
 
				 #define OCFS2_LOCK_NONBLOCK		(0x04)
			
 
				+/* just get back disk inode bh if we've got cluster lock. */
			
 
				+#define OCFS2_META_LOCK_GETBH		(0x08)
			
 
				 
			
 
				 /* Locking subclasses of inode cluster lock */
			
 
				 enum {
			
@@ -170,4 +177,15 @@ void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
 
				 
			
 
				 /* To set the locking protocol on module initialization */
			
 
				 void ocfs2_set_locking_protocol(void);
			
 
				+
			
 
				+/* The _tracker pair is used to avoid cluster recursive locking */
			
 
				+int ocfs2_inode_lock_tracker(struct inode *inode,
			
 
				+			     struct buffer_head **ret_bh,
			
 
				+			     int ex,
			
 
				+			     struct ocfs2_lock_holder *oh);
			
 
				+void ocfs2_inode_unlock_tracker(struct inode *inode,
			
 
				+				int ex,
			
 
				+				struct ocfs2_lock_holder *oh,
			
 
				+				int had_lock);
			
 
				+
			
 
				 #endif	/* DLMGLUE_H */
			
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1138,6 +1138,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 
				 	handle_t *handle = NULL;
			
 
				 	struct dquot *transfer_to[MAXQUOTAS] = { };
			
 
				 	int qtype;
			
 
				+	int had_lock;
			
 
				+	struct ocfs2_lock_holder oh;
			
 
				 
			
 
				 	trace_ocfs2_setattr(inode, dentry,
			
 
				 			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
			
@@ -1173,11 +1175,30 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	status = ocfs2_inode_lock(inode, &bh, 1);
			
 
				-	if (status < 0) {
			
 
				-		if (status != -ENOENT)
			
 
				-			mlog_errno(status);
			
 
				+	had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh);
			
 
				+	if (had_lock < 0) {
			
 
				+		status = had_lock;
			
 
				 		goto bail_unlock_rw;
			
 
				+	} else if (had_lock) {
			
 
				+		/*
			
 
				+		 * As far as we know, ocfs2_setattr() could only be the first
			
 
				+		 * VFS entry point in the call chain of recursive cluster
			
 
				+		 * locking issue.
			
 
				+		 *
			
 
				+		 * For instance:
			
 
				+		 * chmod_common()
			
 
				+		 *  notify_change()
			
 
				+		 *   ocfs2_setattr()
			
 
				+		 *    posix_acl_chmod()
			
 
				+		 *     ocfs2_iop_get_acl()
			
 
				+		 *
			
 
				+		 * But, we're not 100% sure if it's always true, because the
			
 
				+		 * ordering of the VFS entry points in the call chain is out
			
 
				+		 * of our control. So, we'd better dump the stack here to
			
 
				+		 * catch the other cases of recursive locking.
			
 
				+		 */
			
 
				+		mlog(ML_ERROR, "Another case of recursive locking:\n");
			
 
				+		dump_stack();
			
 
				 	}
			
 
				 	inode_locked = 1;
			
 
				 
			
@@ -1260,8 +1281,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 
				 bail_commit:
			
 
				 	ocfs2_commit_trans(osb, handle);
			
 
				 bail_unlock:
			
 
				-	if (status) {
			
 
				-		ocfs2_inode_unlock(inode, 1);
			
 
				+	if (status && inode_locked) {
			
 
				+		ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
			
 
				 		inode_locked = 0;
			
 
				 	}
			
 
				 bail_unlock_rw:
			
@@ -1279,7 +1300,7 @@ bail:
 
				 			mlog_errno(status);
			
 
				 	}
			
 
				 	if (inode_locked)
			
 
				-		ocfs2_inode_unlock(inode, 1);
			
 
				+		ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
			
 
				 
			
 
				 	brelse(bh);
			
 
				 	return status;
			
@@ -1320,21 +1341,32 @@ bail:
 
				 
			
 
				 int ocfs2_permission(struct inode *inode, int mask)
			
 
				 {
			
 
				-	int ret;
			
 
				+	int ret, had_lock;
			
 
				+	struct ocfs2_lock_holder oh;
			
 
				 
			
 
				 	if (mask & MAY_NOT_BLOCK)
			
 
				 		return -ECHILD;
			
 
				 
			
 
				-	ret = ocfs2_inode_lock(inode, NULL, 0);
			
 
				-	if (ret) {
			
 
				-		if (ret != -ENOENT)
			
 
				-			mlog_errno(ret);
			
 
				+	had_lock = ocfs2_inode_lock_tracker(inode, NULL, 0, &oh);
			
 
				+	if (had_lock < 0) {
			
 
				+		ret = had_lock;
			
 
				 		goto out;
			
 
				+	} else if (had_lock) {
			
 
				+		/* See comments in ocfs2_setattr() for details.
			
 
				+		 * The call chain of this case could be:
			
 
				+		 * do_sys_open()
			
 
				+		 *  may_open()
			
 
				+		 *   inode_permission()
			
 
				+		 *    ocfs2_permission()
			
 
				+		 *     ocfs2_iop_get_acl()
			
 
				+		 */
			
 
				+		mlog(ML_ERROR, "Another case of recursive locking:\n");
			
 
				+		dump_stack();
			
 
				 	}
			
 
				 
			
 
				 	ret = generic_permission(inode, mask);
			
 
				 
			
 
				-	ocfs2_inode_unlock(inode, 0);
			
 
				+	ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock);
			
 
				 out:
			
 
				 	return ret;
			
 
				 }
			
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -172,6 +172,7 @@ struct ocfs2_lock_res {
 
				 
			
 
				 	struct list_head         l_blocked_list;
			
 
				 	struct list_head         l_mask_waiters;
			
 
				+	struct list_head	 l_holders;
			
 
				 
			
 
				 	unsigned long		 l_flags;
			
 
				 	char                     l_name[OCFS2_LOCK_ID_MAX_LEN];
			
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -12,6 +12,7 @@
 
				  *  mm/ksm.c (mm hashing).
			
 
				  */
			
 
				 
			
 
				+#include <linux/list.h>
			
 
				 #include <linux/hashtable.h>
			
 
				 #include <linux/sched.h>
			
 
				 #include <linux/mm.h>
			
@@ -26,6 +27,7 @@
 
				 #include <linux/mempolicy.h>
			
 
				 #include <linux/ioctl.h>
			
 
				 #include <linux/security.h>
			
 
				+#include <linux/hugetlb.h>
			
 
				 
			
 
				 static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
			
 
				 
			
@@ -45,12 +47,16 @@ struct userfaultfd_ctx {
 
				 	wait_queue_head_t fault_wqh;
			
 
				 	/* waitqueue head for the pseudo fd to wakeup poll/read */
			
 
				 	wait_queue_head_t fd_wqh;
			
 
				+	/* waitqueue head for events */
			
 
				+	wait_queue_head_t event_wqh;
			
 
				 	/* a refile sequence protected by fault_pending_wqh lock */
			
 
				 	struct seqcount refile_seq;
			
 
				 	/* pseudo fd refcounting */
			
 
				 	atomic_t refcount;
			
 
				 	/* userfaultfd syscall flags */
			
 
				 	unsigned int flags;
			
 
				+	/* features requested from the userspace */
			
 
				+	unsigned int features;
			
 
				 	/* state machine */
			
 
				 	enum userfaultfd_state state;
			
 
				 	/* released */
			
@@ -59,6 +65,12 @@ struct userfaultfd_ctx {
 
				 	struct mm_struct *mm;
			
 
				 };
			
 
				 
			
 
				+struct userfaultfd_fork_ctx {
			
 
				+	struct userfaultfd_ctx *orig;
			
 
				+	struct userfaultfd_ctx *new;
			
 
				+	struct list_head list;
			
 
				+};
			
 
				+
			
 
				 struct userfaultfd_wait_queue {
			
 
				 	struct uffd_msg msg;
			
 
				 	wait_queue_t wq;
			
@@ -142,6 +154,8 @@ static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
 
				 		VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
			
 
				 		VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
			
 
				 		VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
			
 
				+		VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
			
 
				+		VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
			
 
				 		VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
			
 
				 		VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
			
 
				 		mmdrop(ctx->mm);
			
@@ -169,7 +183,7 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
 
				 	msg.arg.pagefault.address = address;
			
 
				 	if (flags & FAULT_FLAG_WRITE)
			
 
				 		/*
			
 
				-		 * If UFFD_FEATURE_PAGEFAULT_FLAG_WRITE was set in the
			
 
				+		 * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
			
 
				 		 * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE
			
 
				 		 * was not set in a UFFD_EVENT_PAGEFAULT, it means it
			
 
				 		 * was a read fault, otherwise if set it means it's
			
@@ -188,6 +202,49 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
 
				 	return msg;
			
 
				 }
			
 
				 
			
 
				+#ifdef CONFIG_HUGETLB_PAGE
			
 
				+/*
			
 
				+ * Same functionality as userfaultfd_must_wait below with modifications for
			
 
				+ * hugepmd ranges.
			
 
				+ */
			
 
				+static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
			
 
				+					 unsigned long address,
			
 
				+					 unsigned long flags,
			
 
				+					 unsigned long reason)
			
 
				+{
			
 
				+	struct mm_struct *mm = ctx->mm;
			
 
				+	pte_t *pte;
			
 
				+	bool ret = true;
			
 
				+
			
 
				+	VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
			
 
				+
			
 
				+	pte = huge_pte_offset(mm, address);
			
 
				+	if (!pte)
			
 
				+		goto out;
			
 
				+
			
 
				+	ret = false;
			
 
				+
			
 
				+	/*
			
 
				+	 * Lockless access: we're in a wait_event so it's ok if it
			
 
				+	 * changes under us.
			
 
				+	 */
			
 
				+	if (huge_pte_none(*pte))
			
 
				+		ret = true;
			
 
				+	if (!huge_pte_write(*pte) && (reason & VM_UFFD_WP))
			
 
				+		ret = true;
			
 
				+out:
			
 
				+	return ret;
			
 
				+}
			
 
				+#else
			
 
				+static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
			
 
				+					 unsigned long address,
			
 
				+					 unsigned long flags,
			
 
				+					 unsigned long reason)
			
 
				+{
			
 
				+	return false;	/* should never get here */
			
 
				+}
			
 
				+#endif /* CONFIG_HUGETLB_PAGE */
			
 
				+
			
 
				 /*
			
 
				  * Verify the pagetables are still not ok after having reigstered into
			
 
				  * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
			
@@ -364,8 +421,12 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
 
				 	set_current_state(blocking_state);
			
 
				 	spin_unlock(&ctx->fault_pending_wqh.lock);
			
 
				 
			
 
				-	must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
			
 
				-					  reason);
			
 
				+	if (!is_vm_hugetlb_page(vmf->vma))
			
 
				+		must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
			
 
				+						  reason);
			
 
				+	else
			
 
				+		must_wait = userfaultfd_huge_must_wait(ctx, vmf->address,
			
 
				+						       vmf->flags, reason);
			
 
				 	up_read(&mm->mmap_sem);
			
 
				 
			
 
				 	if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
			
@@ -458,6 +519,196 @@ out:
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				+static int userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
			
 
				+					     struct userfaultfd_wait_queue *ewq)
			
 
				+{
			
 
				+	int ret = 0;
			
 
				+
			
 
				+	ewq->ctx = ctx;
			
 
				+	init_waitqueue_entry(&ewq->wq, current);
			
 
				+
			
 
				+	spin_lock(&ctx->event_wqh.lock);
			
 
				+	/*
			
 
				+	 * After the __add_wait_queue the ewq is visible to userland
			
 
				+	 * through poll/read().
			
 
				+	 */
			
 
				+	__add_wait_queue(&ctx->event_wqh, &ewq->wq);
			
 
				+	for (;;) {
			
 
				+		set_current_state(TASK_KILLABLE);
			
 
				+		if (ewq->msg.event == 0)
			
 
				+			break;
			
 
				+		if (ACCESS_ONCE(ctx->released) ||
			
 
				+		    fatal_signal_pending(current)) {
			
 
				+			ret = -1;
			
 
				+			__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
			
 
				+			break;
			
 
				+		}
			
 
				+
			
 
				+		spin_unlock(&ctx->event_wqh.lock);
			
 
				+
			
 
				+		wake_up_poll(&ctx->fd_wqh, POLLIN);
			
 
				+		schedule();
			
 
				+
			
 
				+		spin_lock(&ctx->event_wqh.lock);
			
 
				+	}
			
 
				+	__set_current_state(TASK_RUNNING);
			
 
				+	spin_unlock(&ctx->event_wqh.lock);
			
 
				+
			
 
				+	/*
			
 
				+	 * ctx may go away after this if the userfault pseudo fd is
			
 
				+	 * already released.
			
 
				+	 */
			
 
				+
			
 
				+	userfaultfd_ctx_put(ctx);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
			
 
				+				       struct userfaultfd_wait_queue *ewq)
			
 
				+{
			
 
				+	ewq->msg.event = 0;
			
 
				+	wake_up_locked(&ctx->event_wqh);
			
 
				+	__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
			
 
				+}
			
 
				+
			
 
				+int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
			
 
				+{
			
 
				+	struct userfaultfd_ctx *ctx = NULL, *octx;
			
 
				+	struct userfaultfd_fork_ctx *fctx;
			
 
				+
			
 
				+	octx = vma->vm_userfaultfd_ctx.ctx;
			
 
				+	if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
			
 
				+		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
			
 
				+		vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
			
 
				+		return 0;
			
 
				+	}
			
 
				+
			
 
				+	list_for_each_entry(fctx, fcs, list)
			
 
				+		if (fctx->orig == octx) {
			
 
				+			ctx = fctx->new;
			
 
				+			break;
			
 
				+		}
			
 
				+
			
 
				+	if (!ctx) {
			
 
				+		fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
			
 
				+		if (!fctx)
			
 
				+			return -ENOMEM;
			
 
				+
			
 
				+		ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
			
 
				+		if (!ctx) {
			
 
				+			kfree(fctx);
			
 
				+			return -ENOMEM;
			
 
				+		}
			
 
				+
			
 
				+		atomic_set(&ctx->refcount, 1);
			
 
				+		ctx->flags = octx->flags;
			
 
				+		ctx->state = UFFD_STATE_RUNNING;
			
 
				+		ctx->features = octx->features;
			
 
				+		ctx->released = false;
			
 
				+		ctx->mm = vma->vm_mm;
			
 
				+		atomic_inc(&ctx->mm->mm_count);
			
 
				+
			
 
				+		userfaultfd_ctx_get(octx);
			
 
				+		fctx->orig = octx;
			
 
				+		fctx->new = ctx;
			
 
				+		list_add_tail(&fctx->list, fcs);
			
 
				+	}
			
 
				+
			
 
				+	vma->vm_userfaultfd_ctx.ctx = ctx;
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static int dup_fctx(struct userfaultfd_fork_ctx *fctx)
			
 
				+{
			
 
				+	struct userfaultfd_ctx *ctx = fctx->orig;
			
 
				+	struct userfaultfd_wait_queue ewq;
			
 
				+
			
 
				+	msg_init(&ewq.msg);
			
 
				+
			
 
				+	ewq.msg.event = UFFD_EVENT_FORK;
			
 
				+	ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;
			
 
				+
			
 
				+	return userfaultfd_event_wait_completion(ctx, &ewq);
			
 
				+}
			
 
				+
			
 
				+void dup_userfaultfd_complete(struct list_head *fcs)
			
 
				+{
			
 
				+	int ret = 0;
			
 
				+	struct userfaultfd_fork_ctx *fctx, *n;
			
 
				+
			
 
				+	list_for_each_entry_safe(fctx, n, fcs, list) {
			
 
				+		if (!ret)
			
 
				+			ret = dup_fctx(fctx);
			
 
				+		list_del(&fctx->list);
			
 
				+		kfree(fctx);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+void mremap_userfaultfd_prep(struct vm_area_struct *vma,
			
 
				+			     struct vm_userfaultfd_ctx *vm_ctx)
			
 
				+{
			
 
				+	struct userfaultfd_ctx *ctx;
			
 
				+
			
 
				+	ctx = vma->vm_userfaultfd_ctx.ctx;
			
 
				+	if (ctx && (ctx->features & UFFD_FEATURE_EVENT_REMAP)) {
			
 
				+		vm_ctx->ctx = ctx;
			
 
				+		userfaultfd_ctx_get(ctx);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
			
 
				+				 unsigned long from, unsigned long to,
			
 
				+				 unsigned long len)
			
 
				+{
			
 
				+	struct userfaultfd_ctx *ctx = vm_ctx->ctx;
			
 
				+	struct userfaultfd_wait_queue ewq;
			
 
				+
			
 
				+	if (!ctx)
			
 
				+		return;
			
 
				+
			
 
				+	if (to & ~PAGE_MASK) {
			
 
				+		userfaultfd_ctx_put(ctx);
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				+	msg_init(&ewq.msg);
			
 
				+
			
 
				+	ewq.msg.event = UFFD_EVENT_REMAP;
			
 
				+	ewq.msg.arg.remap.from = from;
			
 
				+	ewq.msg.arg.remap.to = to;
			
 
				+	ewq.msg.arg.remap.len = len;
			
 
				+
			
 
				+	userfaultfd_event_wait_completion(ctx, &ewq);
			
 
				+}
			
 
				+
			
 
				+void madvise_userfault_dontneed(struct vm_area_struct *vma,
			
 
				+				struct vm_area_struct **prev,
			
 
				+				unsigned long start, unsigned long end)
			
 
				+{
			
 
				+	struct mm_struct *mm = vma->vm_mm;
			
 
				+	struct userfaultfd_ctx *ctx;
			
 
				+	struct userfaultfd_wait_queue ewq;
			
 
				+
			
 
				+	ctx = vma->vm_userfaultfd_ctx.ctx;
			
 
				+	if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_MADVDONTNEED))
			
 
				+		return;
			
 
				+
			
 
				+	userfaultfd_ctx_get(ctx);
			
 
				+	up_read(&mm->mmap_sem);
			
 
				+
			
 
				+	*prev = NULL; /* We wait for ACK w/o the mmap semaphore */
			
 
				+
			
 
				+	msg_init(&ewq.msg);
			
 
				+
			
 
				+	ewq.msg.event = UFFD_EVENT_MADVDONTNEED;
			
 
				+	ewq.msg.arg.madv_dn.start = start;
			
 
				+	ewq.msg.arg.madv_dn.end = end;
			
 
				+
			
 
				+	userfaultfd_event_wait_completion(ctx, &ewq);
			
 
				+
			
 
				+	down_read(&mm->mmap_sem);
			
 
				+}
			
 
				+
			
 
				 static int userfaultfd_release(struct inode *inode, struct file *file)
			
 
				 {
			
 
				 	struct userfaultfd_ctx *ctx = file->private_data;
			
@@ -522,25 +773,36 @@ wakeup:
 
				 }
			
 
				 
			
 
				 /* fault_pending_wqh.lock must be hold by the caller */
			
 
				-static inline struct userfaultfd_wait_queue *find_userfault(
			
 
				-	struct userfaultfd_ctx *ctx)
			
 
				+static inline struct userfaultfd_wait_queue *find_userfault_in(
			
 
				+		wait_queue_head_t *wqh)
			
 
				 {
			
 
				 	wait_queue_t *wq;
			
 
				 	struct userfaultfd_wait_queue *uwq;
			
 
				 
			
 
				-	VM_BUG_ON(!spin_is_locked(&ctx->fault_pending_wqh.lock));
			
 
				+	VM_BUG_ON(!spin_is_locked(&wqh->lock));
			
 
				 
			
 
				 	uwq = NULL;
			
 
				-	if (!waitqueue_active(&ctx->fault_pending_wqh))
			
 
				+	if (!waitqueue_active(wqh))
			
 
				 		goto out;
			
 
				 	/* walk in reverse to provide FIFO behavior to read userfaults */
			
 
				-	wq = list_last_entry(&ctx->fault_pending_wqh.task_list,
			
 
				-			     typeof(*wq), task_list);
			
 
				+	wq = list_last_entry(&wqh->task_list, typeof(*wq), task_list);
			
 
				 	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
			
 
				 out:
			
 
				 	return uwq;
			
 
				 }
			
 
				 
			
 
				+static inline struct userfaultfd_wait_queue *find_userfault(
			
 
				+		struct userfaultfd_ctx *ctx)
			
 
				+{
			
 
				+	return find_userfault_in(&ctx->fault_pending_wqh);
			
 
				+}
			
 
				+
			
 
				+static inline struct userfaultfd_wait_queue *find_userfault_evt(
			
 
				+		struct userfaultfd_ctx *ctx)
			
 
				+{
			
 
				+	return find_userfault_in(&ctx->event_wqh);
			
 
				+}
			
 
				+
			
 
				 static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
			
 
				 {
			
 
				 	struct userfaultfd_ctx *ctx = file->private_data;
			
@@ -572,10 +834,42 @@ static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
 
				 		smp_mb();
			
 
				 		if (waitqueue_active(&ctx->fault_pending_wqh))
			
 
				 			ret = POLLIN;
			
 
				+		else if (waitqueue_active(&ctx->event_wqh))
			
 
				+			ret = POLLIN;
			
 
				+
			
 
				 		return ret;
			
 
				 	default:
			
 
				-		BUG();
			
 
				+		WARN_ON_ONCE(1);
			
 
				+		return POLLERR;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static const struct file_operations userfaultfd_fops;
			
 
				+
			
 
				+static int resolve_userfault_fork(struct userfaultfd_ctx *ctx,
			
 
				+				  struct userfaultfd_ctx *new,
			
 
				+				  struct uffd_msg *msg)
			
 
				+{
			
 
				+	int fd;
			
 
				+	struct file *file;
			
 
				+	unsigned int flags = new->flags & UFFD_SHARED_FCNTL_FLAGS;
			
 
				+
			
 
				+	fd = get_unused_fd_flags(flags);
			
 
				+	if (fd < 0)
			
 
				+		return fd;
			
 
				+
			
 
				+	file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, new,
			
 
				+				  O_RDWR | flags);
			
 
				+	if (IS_ERR(file)) {
			
 
				+		put_unused_fd(fd);
			
 
				+		return PTR_ERR(file);
			
 
				 	}
			
 
				+
			
 
				+	fd_install(fd, file);
			
 
				+	msg->arg.reserved.reserved1 = 0;
			
 
				+	msg->arg.fork.ufd = fd;
			
 
				+
			
 
				+	return 0;
			
 
				 }
			
 
				 
			
 
				 static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
			
@@ -584,6 +878,15 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
 
				 	ssize_t ret;
			
 
				 	DECLARE_WAITQUEUE(wait, current);
			
 
				 	struct userfaultfd_wait_queue *uwq;
			
 
				+	/*
			
 
				+	 * Handling fork event requires sleeping operations, so
			
 
				+	 * we drop the event_wqh lock, then do these ops, then
			
 
				+	 * lock it back and wake up the waiter. While the lock is
			
 
				+	 * dropped the ewq may go away so we keep track of it
			
 
				+	 * carefully.
			
 
				+	 */
			
 
				+	LIST_HEAD(fork_event);
			
 
				+	struct userfaultfd_ctx *fork_nctx = NULL;
			
 
				 
			
 
				 	/* always take the fd_wqh lock before the fault_pending_wqh lock */
			
 
				 	spin_lock(&ctx->fd_wqh.lock);
			
@@ -635,6 +938,29 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
 
				 			break;
			
 
				 		}
			
 
				 		spin_unlock(&ctx->fault_pending_wqh.lock);
			
 
				+
			
 
				+		spin_lock(&ctx->event_wqh.lock);
			
 
				+		uwq = find_userfault_evt(ctx);
			
 
				+		if (uwq) {
			
 
				+			*msg = uwq->msg;
			
 
				+
			
 
				+			if (uwq->msg.event == UFFD_EVENT_FORK) {
			
 
				+				fork_nctx = (struct userfaultfd_ctx *)
			
 
				+					(unsigned long)
			
 
				+					uwq->msg.arg.reserved.reserved1;
			
 
				+				list_move(&uwq->wq.task_list, &fork_event);
			
 
				+				spin_unlock(&ctx->event_wqh.lock);
			
 
				+				ret = 0;
			
 
				+				break;
			
 
				+			}
			
 
				+
			
 
				+			userfaultfd_event_complete(ctx, uwq);
			
 
				+			spin_unlock(&ctx->event_wqh.lock);
			
 
				+			ret = 0;
			
 
				+			break;
			
 
				+		}
			
 
				+		spin_unlock(&ctx->event_wqh.lock);
			
 
				+
			
 
				 		if (signal_pending(current)) {
			
 
				 			ret = -ERESTARTSYS;
			
 
				 			break;
			
@@ -651,6 +977,23 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
 
				 	__set_current_state(TASK_RUNNING);
			
 
				 	spin_unlock(&ctx->fd_wqh.lock);
			
 
				 
			
 
				+	if (!ret && msg->event == UFFD_EVENT_FORK) {
			
 
				+		ret = resolve_userfault_fork(ctx, fork_nctx, msg);
			
 
				+
			
 
				+		if (!ret) {
			
 
				+			spin_lock(&ctx->event_wqh.lock);
			
 
				+			if (!list_empty(&fork_event)) {
			
 
				+				uwq = list_first_entry(&fork_event,
			
 
				+						       typeof(*uwq),
			
 
				+						       wq.task_list);
			
 
				+				list_del(&uwq->wq.task_list);
			
 
				+				__add_wait_queue(&ctx->event_wqh, &uwq->wq);
			
 
				+				userfaultfd_event_complete(ctx, uwq);
			
 
				+			}
			
 
				+			spin_unlock(&ctx->event_wqh.lock);
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				 	return ret;
			
 
				 }
			
 
				 
			
@@ -753,6 +1096,12 @@ static __always_inline int validate_range(struct mm_struct *mm,
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+static inline bool vma_can_userfault(struct vm_area_struct *vma)
			
 
				+{
			
 
				+	return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
			
 
				+		vma_is_shmem(vma);
			
 
				+}
			
 
				+
			
 
				 static int userfaultfd_register(struct userfaultfd_ctx *ctx,
			
 
				 				unsigned long arg)
			
 
				 {
			
@@ -763,6 +1112,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 
				 	struct uffdio_register __user *user_uffdio_register;
			
 
				 	unsigned long vm_flags, new_flags;
			
 
				 	bool found;
			
 
				+	bool non_anon_pages;
			
 
				 	unsigned long start, end, vma_end;
			
 
				 
			
 
				 	user_uffdio_register = (struct uffdio_register __user *) arg;
			
@@ -813,14 +1163,22 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 
				 	if (vma->vm_start >= end)
			
 
				 		goto out_unlock;
			
 
				 
			
 
				+	/*
			
 
				+	 * If the first vma contains huge pages, make sure start address
			
 
				+	 * is aligned to huge page size.
			
 
				+	 */
			
 
				+	if (is_vm_hugetlb_page(vma)) {
			
 
				+		unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
			
 
				+
			
 
				+		if (start & (vma_hpagesize - 1))
			
 
				+			goto out_unlock;
			
 
				+	}
			
 
				+
			
 
				 	/*
			
 
				 	 * Search for not compatible vmas.
			
 
				-	 *
			
 
				-	 * FIXME: this shall be relaxed later so that it doesn't fail
			
 
				-	 * on tmpfs backed vmas (in addition to the current allowance
			
 
				-	 * on anonymous vmas).
			
 
				 	 */
			
 
				 	found = false;
			
 
				+	non_anon_pages = false;
			
 
				 	for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
			
 
				 		cond_resched();
			
 
				 
			
@@ -829,8 +1187,21 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 
				 
			
 
				 		/* check not compatible vmas */
			
 
				 		ret = -EINVAL;
			
 
				-		if (cur->vm_ops)
			
 
				+		if (!vma_can_userfault(cur))
			
 
				 			goto out_unlock;
			
 
				+		/*
			
 
				+		 * If this vma contains ending address, and huge pages
			
 
				+		 * check alignment.
			
 
				+		 */
			
 
				+		if (is_vm_hugetlb_page(cur) && end <= cur->vm_end &&
			
 
				+		    end > cur->vm_start) {
			
 
				+			unsigned long vma_hpagesize = vma_kernel_pagesize(cur);
			
 
				+
			
 
				+			ret = -EINVAL;
			
 
				+
			
 
				+			if (end & (vma_hpagesize - 1))
			
 
				+				goto out_unlock;
			
 
				+		}
			
 
				 
			
 
				 		/*
			
 
				 		 * Check that this vma isn't already owned by a
			
@@ -843,6 +1214,12 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 
				 		    cur->vm_userfaultfd_ctx.ctx != ctx)
			
 
				 			goto out_unlock;
			
 
				 
			
 
				+		/*
			
 
				+		 * Note vmas containing huge pages
			
 
				+		 */
			
 
				+		if (is_vm_hugetlb_page(cur) || vma_is_shmem(cur))
			
 
				+			non_anon_pages = true;
			
 
				+
			
 
				 		found = true;
			
 
				 	}
			
 
				 	BUG_ON(!found);
			
@@ -854,7 +1231,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 
				 	do {
			
 
				 		cond_resched();
			
 
				 
			
 
				-		BUG_ON(vma->vm_ops);
			
 
				+		BUG_ON(!vma_can_userfault(vma));
			
 
				 		BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
			
 
				 		       vma->vm_userfaultfd_ctx.ctx != ctx);
			
 
				 
			
@@ -912,7 +1289,8 @@ out_unlock:
 
				 		 * userland which ioctls methods are guaranteed to
			
 
				 		 * succeed on this range.
			
 
				 		 */
			
 
				-		if (put_user(UFFD_API_RANGE_IOCTLS,
			
 
				+		if (put_user(non_anon_pages ? UFFD_API_RANGE_IOCTLS_BASIC :
			
 
				+			     UFFD_API_RANGE_IOCTLS,
			
 
				 			     &user_uffdio_register->ioctls))
			
 
				 			ret = -EFAULT;
			
 
				 	}
			
@@ -958,12 +1336,19 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 
				 	if (vma->vm_start >= end)
			
 
				 		goto out_unlock;
			
 
				 
			
 
				+	/*
			
 
				+	 * If the first vma contains huge pages, make sure start address
			
 
				+	 * is aligned to huge page size.
			
 
				+	 */
			
 
				+	if (is_vm_hugetlb_page(vma)) {
			
 
				+		unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
			
 
				+
			
 
				+		if (start & (vma_hpagesize - 1))
			
 
				+			goto out_unlock;
			
 
				+	}
			
 
				+
			
 
				 	/*
			
 
				 	 * Search for not compatible vmas.
			
 
				-	 *
			
 
				-	 * FIXME: this shall be relaxed later so that it doesn't fail
			
 
				-	 * on tmpfs backed vmas (in addition to the current allowance
			
 
				-	 * on anonymous vmas).
			
 
				 	 */
			
 
				 	found = false;
			
 
				 	ret = -EINVAL;
			
@@ -980,7 +1365,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 
				 		 * provides for more strict behavior to notice
			
 
				 		 * unregistration errors.
			
 
				 		 */
			
 
				-		if (cur->vm_ops)
			
 
				+		if (!vma_can_userfault(cur))
			
 
				 			goto out_unlock;
			
 
				 
			
 
				 		found = true;
			
@@ -994,7 +1379,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 
				 	do {
			
 
				 		cond_resched();
			
 
				 
			
 
				-		BUG_ON(vma->vm_ops);
			
 
				+		BUG_ON(!vma_can_userfault(vma));
			
 
				 
			
 
				 		/*
			
 
				 		 * Nothing to do: this vma is already registered into this
			
@@ -1007,6 +1392,19 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 
				 			start = vma->vm_start;
			
 
				 		vma_end = min(end, vma->vm_end);
			
 
				 
			
 
				+		if (userfaultfd_missing(vma)) {
			
 
				+			/*
			
 
				+			 * Wake any concurrent pending userfault while
			
 
				+			 * we unregister, so they will not hang
			
 
				+			 * permanently and it avoids userland to call
			
 
				+			 * UFFDIO_WAKE explicitly.
			
 
				+			 */
			
 
				+			struct userfaultfd_wake_range range;
			
 
				+			range.start = start;
			
 
				+			range.len = vma_end - start;
			
 
				+			wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
			
 
				+		}
			
 
				+
			
 
				 		new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
			
 
				 		prev = vma_merge(mm, prev, start, vma_end, new_flags,
			
 
				 				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
			
@@ -1178,6 +1576,14 @@ out:
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				+static inline unsigned int uffd_ctx_features(__u64 user_features)
			
 
				+{
			
 
				+	/*
			
 
				+	 * For the current set of features the bits just coincide
			
 
				+	 */
			
 
				+	return (unsigned int)user_features;
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * userland asks for a certain API version and we return which bits
			
 
				  * and ioctl commands are implemented in this kernel for such API
			
@@ -1189,6 +1595,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
 
				 	struct uffdio_api uffdio_api;
			
 
				 	void __user *buf = (void __user *)arg;
			
 
				 	int ret;
			
 
				+	__u64 features;
			
 
				 
			
 
				 	ret = -EINVAL;
			
 
				 	if (ctx->state != UFFD_STATE_WAIT_API)
			
@@ -1196,19 +1603,23 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
 
				 	ret = -EFAULT;
			
 
				 	if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
			
 
				 		goto out;
			
 
				-	if (uffdio_api.api != UFFD_API || uffdio_api.features) {
			
 
				+	features = uffdio_api.features;
			
 
				+	if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) {
			
 
				 		memset(&uffdio_api, 0, sizeof(uffdio_api));
			
 
				 		if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
			
 
				 			goto out;
			
 
				 		ret = -EINVAL;
			
 
				 		goto out;
			
 
				 	}
			
 
				+	/* report all available features and ioctls to userland */
			
 
				 	uffdio_api.features = UFFD_API_FEATURES;
			
 
				 	uffdio_api.ioctls = UFFD_API_IOCTLS;
			
 
				 	ret = -EFAULT;
			
 
				 	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
			
 
				 		goto out;
			
 
				 	ctx->state = UFFD_STATE_RUNNING;
			
 
				+	/* only enable the requested features for this uffd context */
			
 
				+	ctx->features = uffd_ctx_features(features);
			
 
				 	ret = 0;
			
 
				 out:
			
 
				 	return ret;
			
@@ -1295,6 +1706,7 @@ static void init_once_userfaultfd_ctx(void *mem)
 
				 
			
 
				 	init_waitqueue_head(&ctx->fault_pending_wqh);
			
 
				 	init_waitqueue_head(&ctx->fault_wqh);
			
 
				+	init_waitqueue_head(&ctx->event_wqh);
			
 
				 	init_waitqueue_head(&ctx->fd_wqh);
			
 
				 	seqcount_init(&ctx->refile_seq);
			
 
				 }
			
@@ -1335,6 +1747,7 @@ static struct file *userfaultfd_file_create(int flags)
 
				 
			
 
				 	atomic_set(&ctx->refcount, 1);
			
 
				 	ctx->flags = flags;
			
 
				+	ctx->features = 0;
			
 
				 	ctx->state = UFFD_STATE_WAIT_API;
			
 
				 	ctx->released = false;
			
 
				 	ctx->mm = current->mm;
			
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1437,12 +1437,9 @@ xfs_filemap_fault(
 
				  */
			
 
				 STATIC int
			
 
				 xfs_filemap_pmd_fault(
			
 
				-	struct vm_area_struct	*vma,
			
 
				-	unsigned long		addr,
			
 
				-	pmd_t			*pmd,
			
 
				-	unsigned int		flags)
			
 
				+	struct vm_fault		*vmf)
			
 
				 {
			
 
				-	struct inode		*inode = file_inode(vma->vm_file);
			
 
				+	struct inode		*inode = file_inode(vmf->vma->vm_file);
			
 
				 	struct xfs_inode	*ip = XFS_I(inode);
			
 
				 	int			ret;
			
 
				 
			
@@ -1451,16 +1448,16 @@ xfs_filemap_pmd_fault(
 
				 
			
 
				 	trace_xfs_filemap_pmd_fault(ip);
			
 
				 
			
 
				-	if (flags & FAULT_FLAG_WRITE) {
			
 
				+	if (vmf->flags & FAULT_FLAG_WRITE) {
			
 
				 		sb_start_pagefault(inode->i_sb);
			
 
				-		file_update_time(vma->vm_file);
			
 
				+		file_update_time(vmf->vma->vm_file);
			
 
				 	}
			
 
				 
			
 
				 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
			
 
				-	ret = dax_iomap_pmd_fault(vma, addr, pmd, flags, &xfs_iomap_ops);
			
 
				+	ret = dax_iomap_pmd_fault(vmf, &xfs_iomap_ops);
			
 
				 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
			
 
				 
			
 
				-	if (flags & FAULT_FLAG_WRITE)
			
 
				+	if (vmf->flags & FAULT_FLAG_WRITE)
			
 
				 		sb_end_pagefault(inode->i_sb);
			
 
				 
			
 
				 	return ret;
			
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -71,15 +71,13 @@ static inline unsigned int dax_radix_order(void *entry)
 
				 		return PMD_SHIFT - PAGE_SHIFT;
			
 
				 	return 0;
			
 
				 }
			
 
				-int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
			
 
				-		pmd_t *pmd, unsigned int flags, const struct iomap_ops *ops);
			
 
				+int dax_iomap_pmd_fault(struct vm_fault *vmf, const struct iomap_ops *ops);
			
 
				 #else
			
 
				 static inline unsigned int dax_radix_order(void *entry)
			
 
				 {
			
 
				 	return 0;
			
 
				 }
			
 
				-static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma,
			
 
				-		unsigned long address, pmd_t *pmd, unsigned int flags,
			
 
				+static inline int dax_iomap_pmd_fault(struct vm_fault *vmf,
			
 
				 		const struct iomap_ops *ops)
			
 
				 {
			
 
				 	return VM_FAULT_FALLBACK;
			
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -33,6 +33,7 @@ enum transparent_hugepage_flag {
 
				 	TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			
 
				 	TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
			
 
				 	TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
			
 
				+	TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
			
 
				 	TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
			
 
				 	TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
			
 
				 	TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG,
			
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -65,7 +65,8 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int,
 
				 int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
			
 
				 long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
			
 
				 			 struct page **, struct vm_area_struct **,
			
 
				-			 unsigned long *, unsigned long *, long, unsigned int);
			
 
				+			 unsigned long *, unsigned long *, long, unsigned int,
			
 
				+			 int *);
			
 
				 void unmap_hugepage_range(struct vm_area_struct *,
			
 
				 			  unsigned long, unsigned long, struct page *);
			
 
				 void __unmap_hugepage_range_final(struct mmu_gather *tlb,
			
@@ -81,6 +82,11 @@ void hugetlb_show_meminfo(void);
 
				 unsigned long hugetlb_total_pages(void);
			
 
				 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			
 
				 			unsigned long address, unsigned int flags);
			
 
				+int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
			
 
				+				struct vm_area_struct *dst_vma,
			
 
				+				unsigned long dst_addr,
			
 
				+				unsigned long src_addr,
			
 
				+				struct page **pagep);
			
 
				 int hugetlb_reserve_pages(struct inode *inode, long from, long to,
			
 
				 						struct vm_area_struct *vma,
			
 
				 						vm_flags_t vm_flags);
			
@@ -131,7 +137,7 @@ static inline unsigned long hugetlb_total_pages(void)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-#define follow_hugetlb_page(m,v,p,vs,a,b,i,w)	({ BUG(); 0; })
			
 
				+#define follow_hugetlb_page(m,v,p,vs,a,b,i,w,n)	({ BUG(); 0; })
			
 
				 #define follow_huge_addr(mm, addr, write)	ERR_PTR(-EINVAL)
			
 
				 #define copy_hugetlb_page_range(src, dst, vma)	({ BUG(); 0; })
			
 
				 static inline void hugetlb_report_meminfo(struct seq_file *m)
			
@@ -149,6 +155,8 @@ static inline void hugetlb_show_meminfo(void)
 
				 #define is_hugepage_only_range(mm, addr, len)	0
			
 
				 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
			
 
				 #define hugetlb_fault(mm, vma, addr, flags)	({ BUG(); 0; })
			
 
				+#define hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \
			
 
				+				src_addr, pagep)	({ BUG(); 0; })
			
 
				 #define huge_pte_offset(mm, address)	0
			
 
				 static inline int dequeue_hwpoisoned_huge_page(struct page *page)
			
 
				 {
			
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -203,6 +203,7 @@ int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
 
				 			    unsigned long  *end_pfn);
			
 
				 void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
			
 
				 			  unsigned long *out_end_pfn, int *out_nid);
			
 
				+unsigned long memblock_next_valid_pfn(unsigned long pfn, unsigned long max_pfn);
			
 
				 
			
 
				 /**
			
 
				  * for_each_mem_pfn_range - early memory pfn range iterator
			
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -253,6 +253,7 @@ struct mem_cgroup {
 
				         /* Index in the kmem_cache->memcg_params.memcg_caches array */
			
 
				 	int kmemcg_id;
			
 
				 	enum memcg_kmem_state kmem_state;
			
 
				+	struct list_head kmem_caches;
			
 
				 #endif
			
 
				 
			
 
				 	int last_scanned_node;
			
@@ -829,6 +830,7 @@ void memcg_kmem_uncharge(struct page *page, int order);
 
				 
			
 
				 #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
			
 
				 extern struct static_key_false memcg_kmem_enabled_key;
			
 
				+extern struct workqueue_struct *memcg_kmem_cache_wq;
			
 
				 
			
 
				 extern int memcg_nr_cache_ids;
			
 
				 void memcg_get_cache_ids(void);
			
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -285,6 +285,17 @@ extern pgprot_t protection_map[16];
 
				 #define FAULT_FLAG_REMOTE	0x80	/* faulting for non current tsk/mm */
			
 
				 #define FAULT_FLAG_INSTRUCTION  0x100	/* The fault was during an instruction fetch */
			
 
				 
			
 
				+#define FAULT_FLAG_TRACE \
			
 
				+	{ FAULT_FLAG_WRITE,		"WRITE" }, \
			
 
				+	{ FAULT_FLAG_MKWRITE,		"MKWRITE" }, \
			
 
				+	{ FAULT_FLAG_ALLOW_RETRY,	"ALLOW_RETRY" }, \
			
 
				+	{ FAULT_FLAG_RETRY_NOWAIT,	"RETRY_NOWAIT" }, \
			
 
				+	{ FAULT_FLAG_KILLABLE,		"KILLABLE" }, \
			
 
				+	{ FAULT_FLAG_TRIED,		"TRIED" }, \
			
 
				+	{ FAULT_FLAG_USER,		"USER" }, \
			
 
				+	{ FAULT_FLAG_REMOTE,		"REMOTE" }, \
			
 
				+	{ FAULT_FLAG_INSTRUCTION,	"INSTRUCTION" }
			
 
				+
			
 
				 /*
			
 
				  * vm_fault is filled by the the pagefault handler and passed to the vma's
			
 
				  * ->fault function. The vma's ->fault is responsible for returning a bitmask
			
@@ -340,8 +351,7 @@ struct vm_operations_struct {
 
				 	void (*close)(struct vm_area_struct * area);
			
 
				 	int (*mremap)(struct vm_area_struct * area);
			
 
				 	int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
			
 
				-	int (*pmd_fault)(struct vm_area_struct *, unsigned long address,
			
 
				-						pmd_t *, unsigned int flags);
			
 
				+	int (*pmd_fault)(struct vm_fault *vmf);
			
 
				 	void (*map_pages)(struct vm_fault *vmf,
			
 
				 			pgoff_t start_pgoff, pgoff_t end_pgoff);
			
 
				 
			
@@ -1111,6 +1121,20 @@ static inline void clear_page_pfmemalloc(struct page *page)
 
				 			 VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \
			
 
				 			 VM_FAULT_FALLBACK)
			
 
				 
			
 
				+#define VM_FAULT_RESULT_TRACE \
			
 
				+	{ VM_FAULT_OOM,			"OOM" }, \
			
 
				+	{ VM_FAULT_SIGBUS,		"SIGBUS" }, \
			
 
				+	{ VM_FAULT_MAJOR,		"MAJOR" }, \
			
 
				+	{ VM_FAULT_WRITE,		"WRITE" }, \
			
 
				+	{ VM_FAULT_HWPOISON,		"HWPOISON" }, \
			
 
				+	{ VM_FAULT_HWPOISON_LARGE,	"HWPOISON_LARGE" }, \
			
 
				+	{ VM_FAULT_SIGSEGV,		"SIGSEGV" }, \
			
 
				+	{ VM_FAULT_NOPAGE,		"NOPAGE" }, \
			
 
				+	{ VM_FAULT_LOCKED,		"LOCKED" }, \
			
 
				+	{ VM_FAULT_RETRY,		"RETRY" }, \
			
 
				+	{ VM_FAULT_FALLBACK,		"FALLBACK" }, \
			
 
				+	{ VM_FAULT_DONE_COW,		"DONE_COW" }
			
 
				+
			
 
				 /* Encode hstate index for a hwpoisoned large page */
			
 
				 #define VM_FAULT_SET_HINDEX(x) ((x) << 12)
			
 
				 #define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf)
			
@@ -1128,8 +1152,7 @@ extern void pagefault_out_of_memory(void);
 
				  */
			
 
				 #define SHOW_MEM_FILTER_NODES		(0x0001u)	/* disallowed nodes */
			
 
				 
			
 
				-extern void show_free_areas(unsigned int flags);
			
 
				-extern bool skip_free_areas_node(unsigned int flags, int nid);
			
 
				+extern void show_free_areas(unsigned int flags, nodemask_t *nodemask);
			
 
				 
			
 
				 int shmem_zero_setup(struct vm_area_struct *);
			
 
				 #ifdef CONFIG_SHMEM
			
@@ -1152,8 +1175,6 @@ struct zap_details {
 
				 	struct address_space *check_mapping;	/* Check page->mapping if set */
			
 
				 	pgoff_t	first_index;			/* Lowest page->index to unmap */
			
 
				 	pgoff_t last_index;			/* Highest page->index to unmap */
			
 
				-	bool ignore_dirty;			/* Ignore dirty pages */
			
 
				-	bool check_swap_entries;		/* Check also swap entries */
			
 
				 };
			
 
				 
			
 
				 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
			
@@ -1164,7 +1185,7 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
 
				 int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
			
 
				 		unsigned long size);
			
 
				 void zap_page_range(struct vm_area_struct *vma, unsigned long address,
			
 
				-		unsigned long size, struct zap_details *);
			
 
				+		unsigned long size);
			
 
				 void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
			
 
				 		unsigned long start, unsigned long end);
			
 
				 
			
@@ -1359,6 +1380,16 @@ static inline bool vma_is_anonymous(struct vm_area_struct *vma)
 
				 	return !vma->vm_ops;
			
 
				 }
			
 
				 
			
 
				+#ifdef CONFIG_SHMEM
			
 
				+/*
			
 
				+ * The vma_is_shmem is not inline because it is used only by slow
			
 
				+ * paths in userfault.
			
 
				+ */
			
 
				+bool vma_is_shmem(struct vm_area_struct *vma);
			
 
				+#else
			
 
				+static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; }
			
 
				+#endif
			
 
				+
			
 
				 static inline int stack_guard_page_start(struct vm_area_struct *vma,
			
 
				 					     unsigned long addr)
			
 
				 {
			
@@ -1900,7 +1931,7 @@ extern void setup_per_zone_wmarks(void);
 
				 extern int __meminit init_per_zone_wmark_min(void);
			
 
				 extern void mem_init(void);
			
 
				 extern void __init mmap_init(void);
			
 
				-extern void show_mem(unsigned int flags);
			
 
				+extern void show_mem(unsigned int flags, nodemask_t *nodemask);
			
 
				 extern long si_mem_available(void);
			
 
				 extern void si_meminfo(struct sysinfo * val);
			
 
				 extern void si_meminfo_node(struct sysinfo *val, int nid);
			
@@ -1908,8 +1939,8 @@ extern void si_meminfo_node(struct sysinfo *val, int nid);
 
				 extern unsigned long arch_reserved_kernel_pages(void);
			
 
				 #endif
			
 
				 
			
 
				-extern __printf(2, 3)
			
 
				-void warn_alloc(gfp_t gfp_mask, const char *fmt, ...);
			
 
				+extern __printf(3, 4)
			
 
				+void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...);
			
 
				 
			
 
				 extern void setup_per_cpu_pageset(void);
			
 
				 
			
@@ -2049,6 +2080,7 @@ static inline void mm_populate(unsigned long addr, unsigned long len) {}
 
				 
			
 
				 /* These take the mm semaphore themselves */
			
 
				 extern int __must_check vm_brk(unsigned long, unsigned long);
			
 
				+extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long);
			
 
				 extern int vm_munmap(unsigned long, size_t);
			
 
				 extern unsigned long __must_check vm_mmap(struct file *, unsigned long,
			
 
				         unsigned long, unsigned long,
			
@@ -2400,6 +2432,10 @@ extern void clear_huge_page(struct page *page,
 
				 extern void copy_user_huge_page(struct page *dst, struct page *src,
			
 
				 				unsigned long addr, struct vm_area_struct *vma,
			
 
				 				unsigned int pages_per_huge_page);
			
 
				+extern long copy_huge_page_from_user(struct page *dst_page,
			
 
				+				const void __user *usr_src,
			
 
				+				unsigned int pages_per_huge_page,
			
 
				+				bool allow_pagefault);
			
 
				 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
			
 
				 
			
 
				 extern struct page_ext_operations debug_guardpage_ops;
			
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -779,7 +779,7 @@ static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
 
				 #endif
			
 
				 }
			
 
				 
			
 
				-extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru);
			
 
				+extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx);
			
 
				 
			
 
				 #ifdef CONFIG_HAVE_MEMORY_PRESENT
			
 
				 void memory_present(int nid, unsigned long start, unsigned long end);
			
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -266,7 +266,6 @@ static inline struct page *find_get_page_flags(struct address_space *mapping,
 
				 
			
 
				 /**
			
 
				  * find_lock_page - locate, pin and lock a pagecache page
			
 
				- * pagecache_get_page - find and get a page reference
			
 
				  * @mapping: the address_space to search
			
 
				  * @offset: the page index
			
 
				  *
			
@@ -482,19 +481,11 @@ static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm,
 
				 }
			
 
				 
			
 
				 /*
			
 
				- * This is exported only for wait_on_page_locked/wait_on_page_writeback,
			
 
				- * and for filesystems which need to wait on PG_private.
			
 
				+ * This is exported only for wait_on_page_locked/wait_on_page_writeback, etc.,
			
 
				+ * and should not be used directly.
			
 
				  */
			
 
				 extern void wait_on_page_bit(struct page *page, int bit_nr);
			
 
				 extern int wait_on_page_bit_killable(struct page *page, int bit_nr);
			
 
				-extern void wake_up_page_bit(struct page *page, int bit_nr);
			
 
				-
			
 
				-static inline void wake_up_page(struct page *page, int bit)
			
 
				-{
			
 
				-	if (!PageWaiters(page))
			
 
				-		return;
			
 
				-	wake_up_page_bit(page, bit);
			
 
				-}
			
 
				 
			
 
				 /* 
			
 
				  * Wait for a page to be unlocked.
			
--- a/include/linux/pfn_t.h
+++ b/include/linux/pfn_t.h
@@ -15,6 +15,12 @@
 
				 #define PFN_DEV (1ULL << (BITS_PER_LONG_LONG - 3))
			
 
				 #define PFN_MAP (1ULL << (BITS_PER_LONG_LONG - 4))
			
 
				 
			
 
				+#define PFN_FLAGS_TRACE \
			
 
				+	{ PFN_SG_CHAIN,	"SG_CHAIN" }, \
			
 
				+	{ PFN_SG_LAST,	"SG_LAST" }, \
			
 
				+	{ PFN_DEV,	"DEV" }, \
			
 
				+	{ PFN_MAP,	"MAP" }
			
 
				+
			
 
				 static inline pfn_t __pfn_to_pfn_t(unsigned long pfn, u64 flags)
			
 
				 {
			
 
				 	pfn_t pfn_t = { .val = pfn | (flags & PFN_FLAGS_MASK), };
			
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -124,4 +124,15 @@ static inline bool shmem_huge_enabled(struct vm_area_struct *vma)
 
				 }
			
 
				 #endif
			
 
				 
			
 
				+#ifdef CONFIG_SHMEM
			
 
				+extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
			
 
				+				  struct vm_area_struct *dst_vma,
			
 
				+				  unsigned long dst_addr,
			
 
				+				  unsigned long src_addr,
			
 
				+				  struct page **pagep);
			
 
				+#else
			
 
				+#define shmem_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \
			
 
				+			       src_addr, pagep)        ({ BUG(); 0; })
			
 
				+#endif
			
 
				+
			
 
				 #endif
			
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -545,22 +545,49 @@ struct memcg_cache_array {
 
				  * array to be accessed without taking any locks, on relocation we free the old
			
 
				  * version only after a grace period.
			
 
				  *
			
 
				- * Child caches will hold extra metadata needed for its operation. Fields are:
			
 
				+ * Root and child caches hold different metadata.
			
 
				  *
			
 
				- * @memcg: pointer to the memcg this cache belongs to
			
 
				- * @root_cache: pointer to the global, root cache, this cache was derived from
			
 
				+ * @root_cache:	Common to root and child caches.  NULL for root, pointer to
			
 
				+ *		the root cache for children.
			
 
				  *
			
 
				- * Both root and child caches of the same kind are linked into a list chained
			
 
				- * through @list.
			
 
				+ * The following fields are specific to root caches.
			
 
				+ *
			
 
				+ * @memcg_caches: kmemcg ID indexed table of child caches.  This table is
			
 
				+ *		used to index child cachces during allocation and cleared
			
 
				+ *		early during shutdown.
			
 
				+ *
			
 
				+ * @root_caches_node: List node for slab_root_caches list.
			
 
				+ *
			
 
				+ * @children:	List of all child caches.  While the child caches are also
			
 
				+ *		reachable through @memcg_caches, a child cache remains on
			
 
				+ *		this list until it is actually destroyed.
			
 
				+ *
			
 
				+ * The following fields are specific to child caches.
			
 
				+ *
			
 
				+ * @memcg:	Pointer to the memcg this cache belongs to.
			
 
				+ *
			
 
				+ * @children_node: List node for @root_cache->children list.
			
 
				+ *
			
 
				+ * @kmem_caches_node: List node for @memcg->kmem_caches list.
			
 
				  */
			
 
				 struct memcg_cache_params {
			
 
				-	bool is_root_cache;
			
 
				-	struct list_head list;
			
 
				+	struct kmem_cache *root_cache;
			
 
				 	union {
			
 
				-		struct memcg_cache_array __rcu *memcg_caches;
			
 
				+		struct {
			
 
				+			struct memcg_cache_array __rcu *memcg_caches;
			
 
				+			struct list_head __root_caches_node;
			
 
				+			struct list_head children;
			
 
				+		};
			
 
				 		struct {
			
 
				 			struct mem_cgroup *memcg;
			
 
				-			struct kmem_cache *root_cache;
			
 
				+			struct list_head children_node;
			
 
				+			struct list_head kmem_caches_node;
			
 
				+
			
 
				+			void (*deact_fn)(struct kmem_cache *);
			
 
				+			union {
			
 
				+				struct rcu_head deact_rcu_head;
			
 
				+				struct work_struct deact_work;
			
 
				+			};
			
 
				 		};
			
 
				 	};
			
 
				 };
			
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -113,9 +113,9 @@ struct kmem_cache {
 
				 
			
 
				 #ifdef CONFIG_SYSFS
			
 
				 #define SLAB_SUPPORTS_SYSFS
			
 
				-void sysfs_slab_remove(struct kmem_cache *);
			
 
				+void sysfs_slab_release(struct kmem_cache *);
			
 
				 #else
			
 
				-static inline void sysfs_slab_remove(struct kmem_cache *s)
			
 
				+static inline void sysfs_slab_release(struct kmem_cache *s)
			
 
				 {
			
 
				 }
			
 
				 #endif
			
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -27,6 +27,7 @@ struct bio;
 
				 #define SWAP_FLAGS_VALID	(SWAP_FLAG_PRIO_MASK | SWAP_FLAG_PREFER | \
			
 
				 				 SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_ONCE | \
			
 
				 				 SWAP_FLAG_DISCARD_PAGES)
			
 
				+#define SWAP_BATCH 64
			
 
				 
			
 
				 static inline int current_is_kswapd(void)
			
 
				 {
			
@@ -176,6 +177,12 @@ enum {
 
				  * protected by swap_info_struct.lock.
			
 
				  */
			
 
				 struct swap_cluster_info {
			
 
				+	spinlock_t lock;	/*
			
 
				+				 * Protect swap_cluster_info fields
			
 
				+				 * and swap_info_struct->swap_map
			
 
				+				 * elements correspond to the swap
			
 
				+				 * cluster
			
 
				+				 */
			
 
				 	unsigned int data:24;
			
 
				 	unsigned int flags:8;
			
 
				 };
			
@@ -337,8 +344,13 @@ int generic_swapfile_activate(struct swap_info_struct *, struct file *,
 
				 		sector_t *);
			
 
				 
			
 
				 /* linux/mm/swap_state.c */
			
 
				-extern struct address_space swapper_spaces[];
			
 
				-#define swap_address_space(entry) (&swapper_spaces[swp_type(entry)])
			
 
				+/* One swap address space for each 64M swap space */
			
 
				+#define SWAP_ADDRESS_SPACE_SHIFT	14
			
 
				+#define SWAP_ADDRESS_SPACE_PAGES	(1 << SWAP_ADDRESS_SPACE_SHIFT)
			
 
				+extern struct address_space *swapper_spaces[];
			
 
				+#define swap_address_space(entry)			    \
			
 
				+	(&swapper_spaces[swp_type(entry)][swp_offset(entry) \
			
 
				+		>> SWAP_ADDRESS_SPACE_SHIFT])
			
 
				 extern unsigned long total_swapcache_pages(void);
			
 
				 extern void show_swap_cache_info(void);
			
 
				 extern int add_to_swap(struct page *, struct list_head *list);
			
@@ -360,6 +372,7 @@ extern struct page *swapin_readahead(swp_entry_t, gfp_t,
 
				 /* linux/mm/swapfile.c */
			
 
				 extern atomic_long_t nr_swap_pages;
			
 
				 extern long total_swap_pages;
			
 
				+extern bool has_usable_swap(void);
			
 
				 
			
 
				 /* Swap 50% full? Release swapcache more aggressively.. */
			
 
				 static inline bool vm_swap_full(void)
			
@@ -375,23 +388,31 @@ static inline long get_nr_swap_pages(void)
 
				 extern void si_swapinfo(struct sysinfo *);
			
 
				 extern swp_entry_t get_swap_page(void);
			
 
				 extern swp_entry_t get_swap_page_of_type(int);
			
 
				+extern int get_swap_pages(int n, swp_entry_t swp_entries[]);
			
 
				 extern int add_swap_count_continuation(swp_entry_t, gfp_t);
			
 
				 extern void swap_shmem_alloc(swp_entry_t);
			
 
				 extern int swap_duplicate(swp_entry_t);
			
 
				 extern int swapcache_prepare(swp_entry_t);
			
 
				 extern void swap_free(swp_entry_t);
			
 
				 extern void swapcache_free(swp_entry_t);
			
 
				+extern void swapcache_free_entries(swp_entry_t *entries, int n);
			
 
				 extern int free_swap_and_cache(swp_entry_t);
			
 
				 extern int swap_type_of(dev_t, sector_t, struct block_device **);
			
 
				 extern unsigned int count_swap_pages(int, int);
			
 
				 extern sector_t map_swap_page(struct page *, struct block_device **);
			
 
				 extern sector_t swapdev_block(int, pgoff_t);
			
 
				 extern int page_swapcount(struct page *);
			
 
				+extern int __swp_swapcount(swp_entry_t entry);
			
 
				 extern int swp_swapcount(swp_entry_t entry);
			
 
				 extern struct swap_info_struct *page_swap_info(struct page *);
			
 
				 extern bool reuse_swap_page(struct page *, int *);
			
 
				 extern int try_to_free_swap(struct page *);
			
 
				 struct backing_dev_info;
			
 
				+extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
			
 
				+extern void exit_swap_address_space(unsigned int type);
			
 
				+
			
 
				+extern int get_swap_slots(int n, swp_entry_t *slots);
			
 
				+extern void swapcache_free_batch(swp_entry_t *entries, int n);
			
 
				 
			
 
				 #else /* CONFIG_SWAP */
			
 
				 
			
@@ -479,6 +500,11 @@ static inline int page_swapcount(struct page *page)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+static inline int __swp_swapcount(swp_entry_t entry)
			
 
				+{
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				 static inline int swp_swapcount(swp_entry_t entry)
			
 
				 {
			
 
				 	return 0;
			
--- a/include/linux/swap_slots.h
+++ b/include/linux/swap_slots.h
@@ -0,0 +1,30 @@
 
				+#ifndef _LINUX_SWAP_SLOTS_H
			
 
				+#define _LINUX_SWAP_SLOTS_H
			
 
				+
			
 
				+#include <linux/swap.h>
			
 
				+#include <linux/spinlock.h>
			
 
				+#include <linux/mutex.h>
			
 
				+
			
 
				+#define SWAP_SLOTS_CACHE_SIZE			SWAP_BATCH
			
 
				+#define THRESHOLD_ACTIVATE_SWAP_SLOTS_CACHE	(5*SWAP_SLOTS_CACHE_SIZE)
			
 
				+#define THRESHOLD_DEACTIVATE_SWAP_SLOTS_CACHE	(2*SWAP_SLOTS_CACHE_SIZE)
			
 
				+
			
 
				+struct swap_slots_cache {
			
 
				+	bool		lock_initialized;
			
 
				+	struct mutex	alloc_lock; /* protects slots, nr, cur */
			
 
				+	swp_entry_t	*slots;
			
 
				+	int		nr;
			
 
				+	int		cur;
			
 
				+	spinlock_t	free_lock;  /* protects slots_ret, n_ret */
			
 
				+	swp_entry_t	*slots_ret;
			
 
				+	int		n_ret;
			
 
				+};
			
 
				+
			
 
				+void disable_swap_slots_cache_lock(void);
			
 
				+void reenable_swap_slots_cache_unlock(void);
			
 
				+int enable_swap_slots_cache(void);
			
 
				+int free_swap_slot(swp_entry_t entry);
			
 
				+
			
 
				+extern bool swap_slot_cache_enabled;
			
 
				+
			
 
				+#endif /* _LINUX_SWAP_SLOTS_H */
			
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -23,6 +23,10 @@ const char *trace_print_symbols_seq(struct trace_seq *p, unsigned long val,
 
				 				    const struct trace_print_flags *symbol_array);
			
 
				 
			
 
				 #if BITS_PER_LONG == 32
			
 
				+const char *trace_print_flags_seq_u64(struct trace_seq *p, const char *delim,
			
 
				+		      unsigned long long flags,
			
 
				+		      const struct trace_print_flags_u64 *flag_array);
			
 
				+
			
 
				 const char *trace_print_symbols_seq_u64(struct trace_seq *p,
			
 
				 					unsigned long long val,
			
 
				 					const struct trace_print_flags_u64
			
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -52,6 +52,20 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
 
				 	return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP);
			
 
				 }
			
 
				 
			
 
				+extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *);
			
 
				+extern void dup_userfaultfd_complete(struct list_head *);
			
 
				+
			
 
				+extern void mremap_userfaultfd_prep(struct vm_area_struct *,
			
 
				+				    struct vm_userfaultfd_ctx *);
			
 
				+extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *,
			
 
				+					unsigned long from, unsigned long to,
			
 
				+					unsigned long len);
			
 
				+
			
 
				+extern void madvise_userfault_dontneed(struct vm_area_struct *vma,
			
 
				+				       struct vm_area_struct **prev,
			
 
				+				       unsigned long start,
			
 
				+				       unsigned long end);
			
 
				+
			
 
				 #else /* CONFIG_USERFAULTFD */
			
 
				 
			
 
				 /* mm helpers */
			
@@ -76,6 +90,34 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
 
				 	return false;
			
 
				 }
			
 
				 
			
 
				+static inline int dup_userfaultfd(struct vm_area_struct *vma,
			
 
				+				  struct list_head *l)
			
 
				+{
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static inline void dup_userfaultfd_complete(struct list_head *l)
			
 
				+{
			
 
				+}
			
 
				+
			
 
				+static inline void mremap_userfaultfd_prep(struct vm_area_struct *vma,
			
 
				+					   struct vm_userfaultfd_ctx *ctx)
			
 
				+{
			
 
				+}
			
 
				+
			
 
				+static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *ctx,
			
 
				+					       unsigned long from,
			
 
				+					       unsigned long to,
			
 
				+					       unsigned long len)
			
 
				+{
			
 
				+}
			
 
				+
			
 
				+static inline void madvise_userfault_dontneed(struct vm_area_struct *vma,
			
 
				+					      struct vm_area_struct **prev,
			
 
				+					      unsigned long start,
			
 
				+					      unsigned long end)
			
 
				+{
			
 
				+}
			
 
				 #endif /* CONFIG_USERFAULTFD */
			
 
				 
			
 
				 #endif /* _LINUX_USERFAULTFD_K_H */
			
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -56,6 +56,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 
				 		COMPACTISOLATED,
			
 
				 		COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS,
			
 
				 		KCOMPACTD_WAKE,
			
 
				+		KCOMPACTD_MIGRATE_SCANNED, KCOMPACTD_FREE_SCANNED,
			
 
				 #endif
			
 
				 #ifdef CONFIG_HUGETLB_PAGE
			
 
				 		HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL,
			
--- a/include/trace/events/compaction.h
+++ b/include/trace/events/compaction.h
@@ -9,62 +9,6 @@
 
				 #include <linux/tracepoint.h>
			
 
				 #include <trace/events/mmflags.h>
			
 
				 
			
 
				-#define COMPACTION_STATUS					\
			
 
				-	EM( COMPACT_SKIPPED,		"skipped")		\
			
 
				-	EM( COMPACT_DEFERRED,		"deferred")		\
			
 
				-	EM( COMPACT_CONTINUE,		"continue")		\
			
 
				-	EM( COMPACT_SUCCESS,		"success")		\
			
 
				-	EM( COMPACT_PARTIAL_SKIPPED,	"partial_skipped")	\
			
 
				-	EM( COMPACT_COMPLETE,		"complete")		\
			
 
				-	EM( COMPACT_NO_SUITABLE_PAGE,	"no_suitable_page")	\
			
 
				-	EM( COMPACT_NOT_SUITABLE_ZONE,	"not_suitable_zone")	\
			
 
				-	EMe(COMPACT_CONTENDED,		"contended")
			
 
				-
			
 
				-#ifdef CONFIG_ZONE_DMA
			
 
				-#define IFDEF_ZONE_DMA(X) X
			
 
				-#else
			
 
				-#define IFDEF_ZONE_DMA(X)
			
 
				-#endif
			
 
				-
			
 
				-#ifdef CONFIG_ZONE_DMA32
			
 
				-#define IFDEF_ZONE_DMA32(X) X
			
 
				-#else
			
 
				-#define IFDEF_ZONE_DMA32(X)
			
 
				-#endif
			
 
				-
			
 
				-#ifdef CONFIG_HIGHMEM
			
 
				-#define IFDEF_ZONE_HIGHMEM(X) X
			
 
				-#else
			
 
				-#define IFDEF_ZONE_HIGHMEM(X)
			
 
				-#endif
			
 
				-
			
 
				-#define ZONE_TYPE						\
			
 
				-	IFDEF_ZONE_DMA(		EM (ZONE_DMA,	 "DMA"))	\
			
 
				-	IFDEF_ZONE_DMA32(	EM (ZONE_DMA32,	 "DMA32"))	\
			
 
				-				EM (ZONE_NORMAL, "Normal")	\
			
 
				-	IFDEF_ZONE_HIGHMEM(	EM (ZONE_HIGHMEM,"HighMem"))	\
			
 
				-				EMe(ZONE_MOVABLE,"Movable")
			
 
				-
			
 
				-/*
			
 
				- * First define the enums in the above macros to be exported to userspace
			
 
				- * via TRACE_DEFINE_ENUM().
			
 
				- */
			
 
				-#undef EM
			
 
				-#undef EMe
			
 
				-#define EM(a, b)	TRACE_DEFINE_ENUM(a);
			
 
				-#define EMe(a, b)	TRACE_DEFINE_ENUM(a);
			
 
				-
			
 
				-COMPACTION_STATUS
			
 
				-ZONE_TYPE
			
 
				-
			
 
				-/*
			
 
				- * Now redefine the EM() and EMe() macros to map the enums to the strings
			
 
				- * that will be printed in the output.
			
 
				- */
			
 
				-#undef EM
			
 
				-#undef EMe
			
 
				-#define EM(a, b)	{a, b},
			
 
				-#define EMe(a, b)	{a, b}
			
 
				 
			
 
				 DECLARE_EVENT_CLASS(mm_compaction_isolate_template,
			
 
				 
			
@@ -187,6 +131,7 @@ TRACE_EVENT(mm_compaction_begin,
 
				 		__entry->sync ? "sync" : "async")
			
 
				 );
			
 
				 
			
 
				+#ifdef CONFIG_COMPACTION
			
 
				 TRACE_EVENT(mm_compaction_end,
			
 
				 	TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn,
			
 
				 		unsigned long free_pfn, unsigned long zone_end, bool sync,
			
@@ -220,6 +165,7 @@ TRACE_EVENT(mm_compaction_end,
 
				 		__entry->sync ? "sync" : "async",
			
 
				 		__print_symbolic(__entry->status, COMPACTION_STATUS))
			
 
				 );
			
 
				+#endif
			
 
				 
			
 
				 TRACE_EVENT(mm_compaction_try_to_compact_pages,
			
 
				 
			
@@ -248,6 +194,7 @@ TRACE_EVENT(mm_compaction_try_to_compact_pages,
 
				 		__entry->prio)
			
 
				 );
			
 
				 
			
 
				+#ifdef CONFIG_COMPACTION
			
 
				 DECLARE_EVENT_CLASS(mm_compaction_suitable_template,
			
 
				 
			
 
				 	TP_PROTO(struct zone *zone,
			
@@ -295,7 +242,6 @@ DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_suitable,
 
				 	TP_ARGS(zone, order, ret)
			
 
				 );
			
 
				 
			
 
				-#ifdef CONFIG_COMPACTION
			
 
				 DECLARE_EVENT_CLASS(mm_compaction_defer_template,
			
 
				 
			
 
				 	TP_PROTO(struct zone *zone, int order),
			
--- a/include/trace/events/fs_dax.h
+++ b/include/trace/events/fs_dax.h
@@ -0,0 +1,156 @@
 
				+#undef TRACE_SYSTEM
			
 
				+#define TRACE_SYSTEM fs_dax
			
 
				+
			
 
				+#if !defined(_TRACE_FS_DAX_H) || defined(TRACE_HEADER_MULTI_READ)
			
 
				+#define _TRACE_FS_DAX_H
			
 
				+
			
 
				+#include <linux/tracepoint.h>
			
 
				+
			
 
				+DECLARE_EVENT_CLASS(dax_pmd_fault_class,
			
 
				+	TP_PROTO(struct inode *inode, struct vm_fault *vmf,
			
 
				+		pgoff_t max_pgoff, int result),
			
 
				+	TP_ARGS(inode, vmf, max_pgoff, result),
			
 
				+	TP_STRUCT__entry(
			
 
				+		__field(unsigned long, ino)
			
 
				+		__field(unsigned long, vm_start)
			
 
				+		__field(unsigned long, vm_end)
			
 
				+		__field(unsigned long, vm_flags)
			
 
				+		__field(unsigned long, address)
			
 
				+		__field(pgoff_t, pgoff)
			
 
				+		__field(pgoff_t, max_pgoff)
			
 
				+		__field(dev_t, dev)
			
 
				+		__field(unsigned int, flags)
			
 
				+		__field(int, result)
			
 
				+	),
			
 
				+	TP_fast_assign(
			
 
				+		__entry->dev = inode->i_sb->s_dev;
			
 
				+		__entry->ino = inode->i_ino;
			
 
				+		__entry->vm_start = vmf->vma->vm_start;
			
 
				+		__entry->vm_end = vmf->vma->vm_end;
			
 
				+		__entry->vm_flags = vmf->vma->vm_flags;
			
 
				+		__entry->address = vmf->address;
			
 
				+		__entry->flags = vmf->flags;
			
 
				+		__entry->pgoff = vmf->pgoff;
			
 
				+		__entry->max_pgoff = max_pgoff;
			
 
				+		__entry->result = result;
			
 
				+	),
			
 
				+	TP_printk("dev %d:%d ino %#lx %s %s address %#lx vm_start "
			
 
				+			"%#lx vm_end %#lx pgoff %#lx max_pgoff %#lx %s",
			
 
				+		MAJOR(__entry->dev),
			
 
				+		MINOR(__entry->dev),
			
 
				+		__entry->ino,
			
 
				+		__entry->vm_flags & VM_SHARED ? "shared" : "private",
			
 
				+		__print_flags(__entry->flags, "|", FAULT_FLAG_TRACE),
			
 
				+		__entry->address,
			
 
				+		__entry->vm_start,
			
 
				+		__entry->vm_end,
			
 
				+		__entry->pgoff,
			
 
				+		__entry->max_pgoff,
			
 
				+		__print_flags(__entry->result, "|", VM_FAULT_RESULT_TRACE)
			
 
				+	)
			
 
				+)
			
 
				+
			
 
				+#define DEFINE_PMD_FAULT_EVENT(name) \
			
 
				+DEFINE_EVENT(dax_pmd_fault_class, name, \
			
 
				+	TP_PROTO(struct inode *inode, struct vm_fault *vmf, \
			
 
				+		pgoff_t max_pgoff, int result), \
			
 
				+	TP_ARGS(inode, vmf, max_pgoff, result))
			
 
				+
			
 
				+DEFINE_PMD_FAULT_EVENT(dax_pmd_fault);
			
 
				+DEFINE_PMD_FAULT_EVENT(dax_pmd_fault_done);
			
 
				+
			
 
				+DECLARE_EVENT_CLASS(dax_pmd_load_hole_class,
			
 
				+	TP_PROTO(struct inode *inode, struct vm_fault *vmf,
			
 
				+		struct page *zero_page,
			
 
				+		void *radix_entry),
			
 
				+	TP_ARGS(inode, vmf, zero_page, radix_entry),
			
 
				+	TP_STRUCT__entry(
			
 
				+		__field(unsigned long, ino)
			
 
				+		__field(unsigned long, vm_flags)
			
 
				+		__field(unsigned long, address)
			
 
				+		__field(struct page *, zero_page)
			
 
				+		__field(void *, radix_entry)
			
 
				+		__field(dev_t, dev)
			
 
				+	),
			
 
				+	TP_fast_assign(
			
 
				+		__entry->dev = inode->i_sb->s_dev;
			
 
				+		__entry->ino = inode->i_ino;
			
 
				+		__entry->vm_flags = vmf->vma->vm_flags;
			
 
				+		__entry->address = vmf->address;
			
 
				+		__entry->zero_page = zero_page;
			
 
				+		__entry->radix_entry = radix_entry;
			
 
				+	),
			
 
				+	TP_printk("dev %d:%d ino %#lx %s address %#lx zero_page %p "
			
 
				+			"radix_entry %#lx",
			
 
				+		MAJOR(__entry->dev),
			
 
				+		MINOR(__entry->dev),
			
 
				+		__entry->ino,
			
 
				+		__entry->vm_flags & VM_SHARED ? "shared" : "private",
			
 
				+		__entry->address,
			
 
				+		__entry->zero_page,
			
 
				+		(unsigned long)__entry->radix_entry
			
 
				+	)
			
 
				+)
			
 
				+
			
 
				+#define DEFINE_PMD_LOAD_HOLE_EVENT(name) \
			
 
				+DEFINE_EVENT(dax_pmd_load_hole_class, name, \
			
 
				+	TP_PROTO(struct inode *inode, struct vm_fault *vmf, \
			
 
				+		struct page *zero_page, void *radix_entry), \
			
 
				+	TP_ARGS(inode, vmf, zero_page, radix_entry))
			
 
				+
			
 
				+DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole);
			
 
				+DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole_fallback);
			
 
				+
			
 
				+DECLARE_EVENT_CLASS(dax_pmd_insert_mapping_class,
			
 
				+	TP_PROTO(struct inode *inode, struct vm_fault *vmf,
			
 
				+		long length, pfn_t pfn, void *radix_entry),
			
 
				+	TP_ARGS(inode, vmf, length, pfn, radix_entry),
			
 
				+	TP_STRUCT__entry(
			
 
				+		__field(unsigned long, ino)
			
 
				+		__field(unsigned long, vm_flags)
			
 
				+		__field(unsigned long, address)
			
 
				+		__field(long, length)
			
 
				+		__field(u64, pfn_val)
			
 
				+		__field(void *, radix_entry)
			
 
				+		__field(dev_t, dev)
			
 
				+		__field(int, write)
			
 
				+	),
			
 
				+	TP_fast_assign(
			
 
				+		__entry->dev = inode->i_sb->s_dev;
			
 
				+		__entry->ino = inode->i_ino;
			
 
				+		__entry->vm_flags = vmf->vma->vm_flags;
			
 
				+		__entry->address = vmf->address;
			
 
				+		__entry->write = vmf->flags & FAULT_FLAG_WRITE;
			
 
				+		__entry->length = length;
			
 
				+		__entry->pfn_val = pfn.val;
			
 
				+		__entry->radix_entry = radix_entry;
			
 
				+	),
			
 
				+	TP_printk("dev %d:%d ino %#lx %s %s address %#lx length %#lx "
			
 
				+			"pfn %#llx %s radix_entry %#lx",
			
 
				+		MAJOR(__entry->dev),
			
 
				+		MINOR(__entry->dev),
			
 
				+		__entry->ino,
			
 
				+		__entry->vm_flags & VM_SHARED ? "shared" : "private",
			
 
				+		__entry->write ? "write" : "read",
			
 
				+		__entry->address,
			
 
				+		__entry->length,
			
 
				+		__entry->pfn_val & ~PFN_FLAGS_MASK,
			
 
				+		__print_flags_u64(__entry->pfn_val & PFN_FLAGS_MASK, "|",
			
 
				+			PFN_FLAGS_TRACE),
			
 
				+		(unsigned long)__entry->radix_entry
			
 
				+	)
			
 
				+)
			
 
				+
			
 
				+#define DEFINE_PMD_INSERT_MAPPING_EVENT(name) \
			
 
				+DEFINE_EVENT(dax_pmd_insert_mapping_class, name, \
			
 
				+	TP_PROTO(struct inode *inode, struct vm_fault *vmf, \
			
 
				+		long length, pfn_t pfn, void *radix_entry), \
			
 
				+	TP_ARGS(inode, vmf, length, pfn, radix_entry))
			
 
				+
			
 
				+DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping);
			
 
				+DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping_fallback);
			
 
				+
			
 
				+#endif /* _TRACE_FS_DAX_H */
			
 
				+
			
 
				+/* This part must be outside protection */
			
 
				+#include <trace/define_trace.h>
			
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -1,3 +1,6 @@
 
				+#include <linux/node.h>
			
 
				+#include <linux/mmzone.h>
			
 
				+#include <linux/compaction.h>
			
 
				 /*
			
 
				  * The order of these masks is important. Matching masks will be seen
			
 
				  * first and the left over flags will end up showing by themselves.
			
@@ -171,3 +174,98 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY,	"softdirty"	)		\
 
				 	(flags) ? __print_flags(flags, "|",				\
			
 
				 	__def_vmaflag_names						\
			
 
				 	) : "none"
			
 
				+
			
 
				+#ifdef CONFIG_COMPACTION
			
 
				+#define COMPACTION_STATUS					\
			
 
				+	EM( COMPACT_SKIPPED,		"skipped")		\
			
 
				+	EM( COMPACT_DEFERRED,		"deferred")		\
			
 
				+	EM( COMPACT_CONTINUE,		"continue")		\
			
 
				+	EM( COMPACT_SUCCESS,		"success")		\
			
 
				+	EM( COMPACT_PARTIAL_SKIPPED,	"partial_skipped")	\
			
 
				+	EM( COMPACT_COMPLETE,		"complete")		\
			
 
				+	EM( COMPACT_NO_SUITABLE_PAGE,	"no_suitable_page")	\
			
 
				+	EM( COMPACT_NOT_SUITABLE_ZONE,	"not_suitable_zone")	\
			
 
				+	EMe(COMPACT_CONTENDED,		"contended")
			
 
				+
			
 
				+/* High-level compaction status feedback */
			
 
				+#define COMPACTION_FAILED	1
			
 
				+#define COMPACTION_WITHDRAWN	2
			
 
				+#define COMPACTION_PROGRESS	3
			
 
				+
			
 
				+#define compact_result_to_feedback(result)	\
			
 
				+({						\
			
 
				+	enum compact_result __result = result;	\
			
 
				+	(compaction_failed(__result)) ? COMPACTION_FAILED : \
			
 
				+		(compaction_withdrawn(__result)) ? COMPACTION_WITHDRAWN : COMPACTION_PROGRESS; \
			
 
				+})
			
 
				+
			
 
				+#define COMPACTION_FEEDBACK		\
			
 
				+	EM(COMPACTION_FAILED,		"failed")	\
			
 
				+	EM(COMPACTION_WITHDRAWN,	"withdrawn")	\
			
 
				+	EMe(COMPACTION_PROGRESS,	"progress")
			
 
				+
			
 
				+#define COMPACTION_PRIORITY						\
			
 
				+	EM(COMPACT_PRIO_SYNC_FULL,	"COMPACT_PRIO_SYNC_FULL")	\
			
 
				+	EM(COMPACT_PRIO_SYNC_LIGHT,	"COMPACT_PRIO_SYNC_LIGHT")	\
			
 
				+	EMe(COMPACT_PRIO_ASYNC,		"COMPACT_PRIO_ASYNC")
			
 
				+#else
			
 
				+#define COMPACTION_STATUS
			
 
				+#define COMPACTION_PRIORITY
			
 
				+#define COMPACTION_FEEDBACK
			
 
				+#endif
			
 
				+
			
 
				+#ifdef CONFIG_ZONE_DMA
			
 
				+#define IFDEF_ZONE_DMA(X) X
			
 
				+#else
			
 
				+#define IFDEF_ZONE_DMA(X)
			
 
				+#endif
			
 
				+
			
 
				+#ifdef CONFIG_ZONE_DMA32
			
 
				+#define IFDEF_ZONE_DMA32(X) X
			
 
				+#else
			
 
				+#define IFDEF_ZONE_DMA32(X)
			
 
				+#endif
			
 
				+
			
 
				+#ifdef CONFIG_HIGHMEM
			
 
				+#define IFDEF_ZONE_HIGHMEM(X) X
			
 
				+#else
			
 
				+#define IFDEF_ZONE_HIGHMEM(X)
			
 
				+#endif
			
 
				+
			
 
				+#define ZONE_TYPE						\
			
 
				+	IFDEF_ZONE_DMA(		EM (ZONE_DMA,	 "DMA"))	\
			
 
				+	IFDEF_ZONE_DMA32(	EM (ZONE_DMA32,	 "DMA32"))	\
			
 
				+				EM (ZONE_NORMAL, "Normal")	\
			
 
				+	IFDEF_ZONE_HIGHMEM(	EM (ZONE_HIGHMEM,"HighMem"))	\
			
 
				+				EMe(ZONE_MOVABLE,"Movable")
			
 
				+
			
 
				+#define LRU_NAMES		\
			
 
				+		EM (LRU_INACTIVE_ANON, "inactive_anon") \
			
 
				+		EM (LRU_ACTIVE_ANON, "active_anon") \
			
 
				+		EM (LRU_INACTIVE_FILE, "inactive_file") \
			
 
				+		EM (LRU_ACTIVE_FILE, "active_file") \
			
 
				+		EMe(LRU_UNEVICTABLE, "unevictable")
			
 
				+
			
 
				+/*
			
 
				+ * First define the enums in the above macros to be exported to userspace
			
 
				+ * via TRACE_DEFINE_ENUM().
			
 
				+ */
			
 
				+#undef EM
			
 
				+#undef EMe
			
 
				+#define EM(a, b)	TRACE_DEFINE_ENUM(a);
			
 
				+#define EMe(a, b)	TRACE_DEFINE_ENUM(a);
			
 
				+
			
 
				+COMPACTION_STATUS
			
 
				+COMPACTION_PRIORITY
			
 
				+COMPACTION_FEEDBACK
			
 
				+ZONE_TYPE
			
 
				+LRU_NAMES
			
 
				+
			
 
				+/*
			
 
				+ * Now redefine the EM() and EMe() macros to map the enums to the strings
			
 
				+ * that will be printed in the output.
			
 
				+ */
			
 
				+#undef EM
			
 
				+#undef EMe
			
 
				+#define EM(a, b)	{a, b},
			
 
				+#define EMe(a, b)	{a, b}
			
--- a/include/trace/events/oom.h
+++ b/include/trace/events/oom.h
@@ -4,6 +4,7 @@
 
				 #if !defined(_TRACE_OOM_H) || defined(TRACE_HEADER_MULTI_READ)
			
 
				 #define _TRACE_OOM_H
			
 
				 #include <linux/tracepoint.h>
			
 
				+#include <trace/events/mmflags.h>
			
 
				 
			
 
				 TRACE_EVENT(oom_score_adj_update,
			
 
				 
			
@@ -27,6 +28,86 @@ TRACE_EVENT(oom_score_adj_update,
 
				 		__entry->pid, __entry->comm, __entry->oom_score_adj)
			
 
				 );
			
 
				 
			
 
				+TRACE_EVENT(reclaim_retry_zone,
			
 
				+
			
 
				+	TP_PROTO(struct zoneref *zoneref,
			
 
				+		int order,
			
 
				+		unsigned long reclaimable,
			
 
				+		unsigned long available,
			
 
				+		unsigned long min_wmark,
			
 
				+		int no_progress_loops,
			
 
				+		bool wmark_check),
			
 
				+
			
 
				+	TP_ARGS(zoneref, order, reclaimable, available, min_wmark, no_progress_loops, wmark_check),
			
 
				+
			
 
				+	TP_STRUCT__entry(
			
 
				+		__field(	int, node)
			
 
				+		__field(	int, zone_idx)
			
 
				+		__field(	int,	order)
			
 
				+		__field(	unsigned long,	reclaimable)
			
 
				+		__field(	unsigned long,	available)
			
 
				+		__field(	unsigned long,	min_wmark)
			
 
				+		__field(	int,	no_progress_loops)
			
 
				+		__field(	bool,	wmark_check)
			
 
				+	),
			
 
				+
			
 
				+	TP_fast_assign(
			
 
				+		__entry->node = zone_to_nid(zoneref->zone);
			
 
				+		__entry->zone_idx = zoneref->zone_idx;
			
 
				+		__entry->order = order;
			
 
				+		__entry->reclaimable = reclaimable;
			
 
				+		__entry->available = available;
			
 
				+		__entry->min_wmark = min_wmark;
			
 
				+		__entry->no_progress_loops = no_progress_loops;
			
 
				+		__entry->wmark_check = wmark_check;
			
 
				+	),
			
 
				+
			
 
				+	TP_printk("node=%d zone=%-8s order=%d reclaimable=%lu available=%lu min_wmark=%lu no_progress_loops=%d wmark_check=%d",
			
 
				+			__entry->node, __print_symbolic(__entry->zone_idx, ZONE_TYPE),
			
 
				+			__entry->order,
			
 
				+			__entry->reclaimable, __entry->available, __entry->min_wmark,
			
 
				+			__entry->no_progress_loops,
			
 
				+			__entry->wmark_check)
			
 
				+);
			
 
				+
			
 
				+#ifdef CONFIG_COMPACTION
			
 
				+TRACE_EVENT(compact_retry,
			
 
				+
			
 
				+	TP_PROTO(int order,
			
 
				+		enum compact_priority priority,
			
 
				+		enum compact_result result,
			
 
				+		int retries,
			
 
				+		int max_retries,
			
 
				+		bool ret),
			
 
				+
			
 
				+	TP_ARGS(order, priority, result, retries, max_retries, ret),
			
 
				+
			
 
				+	TP_STRUCT__entry(
			
 
				+		__field(	int, order)
			
 
				+		__field(	int, priority)
			
 
				+		__field(	int, result)
			
 
				+		__field(	int, retries)
			
 
				+		__field(	int, max_retries)
			
 
				+		__field(	bool, ret)
			
 
				+	),
			
 
				+
			
 
				+	TP_fast_assign(
			
 
				+		__entry->order = order;
			
 
				+		__entry->priority = priority;
			
 
				+		__entry->result = compact_result_to_feedback(result);
			
 
				+		__entry->retries = retries;
			
 
				+		__entry->max_retries = max_retries;
			
 
				+		__entry->ret = ret;
			
 
				+	),
			
 
				+
			
 
				+	TP_printk("order=%d priority=%s compaction_result=%s retries=%d max_retries=%d should_retry=%d",
			
 
				+			__entry->order,
			
 
				+			__print_symbolic(__entry->priority, COMPACTION_PRIORITY),
			
 
				+			__print_symbolic(__entry->result, COMPACTION_FEEDBACK),
			
 
				+			__entry->retries, __entry->max_retries,
			
 
				+			__entry->ret)
			
 
				+);
			
 
				+#endif /* CONFIG_COMPACTION */
			
 
				 #endif
			
 
				 
			
 
				 /* This part must be outside protection */
			
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -15,6 +15,7 @@
 
				 #define RECLAIM_WB_MIXED	0x0010u
			
 
				 #define RECLAIM_WB_SYNC		0x0004u /* Unused, all reclaim async */
			
 
				 #define RECLAIM_WB_ASYNC	0x0008u
			
 
				+#define RECLAIM_WB_LRU		(RECLAIM_WB_ANON|RECLAIM_WB_FILE)
			
 
				 
			
 
				 #define show_reclaim_flags(flags)				\
			
 
				 	(flags) ? __print_flags(flags, "|",			\
			
@@ -269,26 +270,27 @@ TRACE_EVENT(mm_shrink_slab_end,
 
				 		__entry->retval)
			
 
				 );
			
 
				 
			
 
				-DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
			
 
				-
			
 
				+TRACE_EVENT(mm_vmscan_lru_isolate,
			
 
				 	TP_PROTO(int classzone_idx,
			
 
				 		int order,
			
 
				 		unsigned long nr_requested,
			
 
				 		unsigned long nr_scanned,
			
 
				+		unsigned long nr_skipped,
			
 
				 		unsigned long nr_taken,
			
 
				 		isolate_mode_t isolate_mode,
			
 
				-		int file),
			
 
				+		int lru),
			
 
				 
			
 
				-	TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file),
			
 
				+	TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_skipped, nr_taken, isolate_mode, lru),
			
 
				 
			
 
				 	TP_STRUCT__entry(
			
 
				 		__field(int, classzone_idx)
			
 
				 		__field(int, order)
			
 
				 		__field(unsigned long, nr_requested)
			
 
				 		__field(unsigned long, nr_scanned)
			
 
				+		__field(unsigned long, nr_skipped)
			
 
				 		__field(unsigned long, nr_taken)
			
 
				 		__field(isolate_mode_t, isolate_mode)
			
 
				-		__field(int, file)
			
 
				+		__field(int, lru)
			
 
				 	),
			
 
				 
			
 
				 	TP_fast_assign(
			
@@ -296,47 +298,21 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
 
				 		__entry->order = order;
			
 
				 		__entry->nr_requested = nr_requested;
			
 
				 		__entry->nr_scanned = nr_scanned;
			
 
				+		__entry->nr_skipped = nr_skipped;
			
 
				 		__entry->nr_taken = nr_taken;
			
 
				 		__entry->isolate_mode = isolate_mode;
			
 
				-		__entry->file = file;
			
 
				+		__entry->lru = lru;
			
 
				 	),
			
 
				 
			
 
				-	TP_printk("isolate_mode=%d classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu file=%d",
			
 
				+	TP_printk("isolate_mode=%d classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_skipped=%lu nr_taken=%lu lru=%s",
			
 
				 		__entry->isolate_mode,
			
 
				 		__entry->classzone_idx,
			
 
				 		__entry->order,
			
 
				 		__entry->nr_requested,
			
 
				 		__entry->nr_scanned,
			
 
				+		__entry->nr_skipped,
			
 
				 		__entry->nr_taken,
			
 
				-		__entry->file)
			
 
				-);
			
 
				-
			
 
				-DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate,
			
 
				-
			
 
				-	TP_PROTO(int classzone_idx,
			
 
				-		int order,
			
 
				-		unsigned long nr_requested,
			
 
				-		unsigned long nr_scanned,
			
 
				-		unsigned long nr_taken,
			
 
				-		isolate_mode_t isolate_mode,
			
 
				-		int file),
			
 
				-
			
 
				-	TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file)
			
 
				-
			
 
				-);
			
 
				-
			
 
				-DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_memcg_isolate,
			
 
				-
			
 
				-	TP_PROTO(int classzone_idx,
			
 
				-		int order,
			
 
				-		unsigned long nr_requested,
			
 
				-		unsigned long nr_scanned,
			
 
				-		unsigned long nr_taken,
			
 
				-		isolate_mode_t isolate_mode,
			
 
				-		int file),
			
 
				-
			
 
				-	TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file)
			
 
				-
			
 
				+		__print_symbolic(__entry->lru, LRU_NAMES))
			
 
				 );
			
 
				 
			
 
				 TRACE_EVENT(mm_vmscan_writepage,
			
@@ -365,14 +341,27 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive,
 
				 
			
 
				 	TP_PROTO(int nid,
			
 
				 		unsigned long nr_scanned, unsigned long nr_reclaimed,
			
 
				+		unsigned long nr_dirty, unsigned long nr_writeback,
			
 
				+		unsigned long nr_congested, unsigned long nr_immediate,
			
 
				+		unsigned long nr_activate, unsigned long nr_ref_keep,
			
 
				+		unsigned long nr_unmap_fail,
			
 
				 		int priority, int file),
			
 
				 
			
 
				-	TP_ARGS(nid, nr_scanned, nr_reclaimed, priority, file),
			
 
				+	TP_ARGS(nid, nr_scanned, nr_reclaimed, nr_dirty, nr_writeback,
			
 
				+		nr_congested, nr_immediate, nr_activate, nr_ref_keep,
			
 
				+		nr_unmap_fail, priority, file),
			
 
				 
			
 
				 	TP_STRUCT__entry(
			
 
				 		__field(int, nid)
			
 
				 		__field(unsigned long, nr_scanned)
			
 
				 		__field(unsigned long, nr_reclaimed)
			
 
				+		__field(unsigned long, nr_dirty)
			
 
				+		__field(unsigned long, nr_writeback)
			
 
				+		__field(unsigned long, nr_congested)
			
 
				+		__field(unsigned long, nr_immediate)
			
 
				+		__field(unsigned long, nr_activate)
			
 
				+		__field(unsigned long, nr_ref_keep)
			
 
				+		__field(unsigned long, nr_unmap_fail)
			
 
				 		__field(int, priority)
			
 
				 		__field(int, reclaim_flags)
			
 
				 	),
			
@@ -381,17 +370,102 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive,
 
				 		__entry->nid = nid;
			
 
				 		__entry->nr_scanned = nr_scanned;
			
 
				 		__entry->nr_reclaimed = nr_reclaimed;
			
 
				+		__entry->nr_dirty = nr_dirty;
			
 
				+		__entry->nr_writeback = nr_writeback;
			
 
				+		__entry->nr_congested = nr_congested;
			
 
				+		__entry->nr_immediate = nr_immediate;
			
 
				+		__entry->nr_activate = nr_activate;
			
 
				+		__entry->nr_ref_keep = nr_ref_keep;
			
 
				+		__entry->nr_unmap_fail = nr_unmap_fail;
			
 
				 		__entry->priority = priority;
			
 
				 		__entry->reclaim_flags = trace_shrink_flags(file);
			
 
				 	),
			
 
				 
			
 
				-	TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld priority=%d flags=%s",
			
 
				+	TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld nr_dirty=%ld nr_writeback=%ld nr_congested=%ld nr_immediate=%ld nr_activate=%ld nr_ref_keep=%ld nr_unmap_fail=%ld priority=%d flags=%s",
			
 
				 		__entry->nid,
			
 
				 		__entry->nr_scanned, __entry->nr_reclaimed,
			
 
				+		__entry->nr_dirty, __entry->nr_writeback,
			
 
				+		__entry->nr_congested, __entry->nr_immediate,
			
 
				+		__entry->nr_activate, __entry->nr_ref_keep,
			
 
				+		__entry->nr_unmap_fail, __entry->priority,
			
 
				+		show_reclaim_flags(__entry->reclaim_flags))
			
 
				+);
			
 
				+
			
 
				+TRACE_EVENT(mm_vmscan_lru_shrink_active,
			
 
				+
			
 
				+	TP_PROTO(int nid, unsigned long nr_taken,
			
 
				+		unsigned long nr_active, unsigned long nr_deactivated,
			
 
				+		unsigned long nr_referenced, int priority, int file),
			
 
				+
			
 
				+	TP_ARGS(nid, nr_taken, nr_active, nr_deactivated, nr_referenced, priority, file),
			
 
				+
			
 
				+	TP_STRUCT__entry(
			
 
				+		__field(int, nid)
			
 
				+		__field(unsigned long, nr_taken)
			
 
				+		__field(unsigned long, nr_active)
			
 
				+		__field(unsigned long, nr_deactivated)
			
 
				+		__field(unsigned long, nr_referenced)
			
 
				+		__field(int, priority)
			
 
				+		__field(int, reclaim_flags)
			
 
				+	),
			
 
				+
			
 
				+	TP_fast_assign(
			
 
				+		__entry->nid = nid;
			
 
				+		__entry->nr_taken = nr_taken;
			
 
				+		__entry->nr_active = nr_active;
			
 
				+		__entry->nr_deactivated = nr_deactivated;
			
 
				+		__entry->nr_referenced = nr_referenced;
			
 
				+		__entry->priority = priority;
			
 
				+		__entry->reclaim_flags = trace_shrink_flags(file);
			
 
				+	),
			
 
				+
			
 
				+	TP_printk("nid=%d nr_taken=%ld nr_active=%ld nr_deactivated=%ld nr_referenced=%ld priority=%d flags=%s",
			
 
				+		__entry->nid,
			
 
				+		__entry->nr_taken,
			
 
				+		__entry->nr_active, __entry->nr_deactivated, __entry->nr_referenced,
			
 
				 		__entry->priority,
			
 
				 		show_reclaim_flags(__entry->reclaim_flags))
			
 
				 );
			
 
				 
			
 
				+TRACE_EVENT(mm_vmscan_inactive_list_is_low,
			
 
				+
			
 
				+	TP_PROTO(int nid, int reclaim_idx,
			
 
				+		unsigned long total_inactive, unsigned long inactive,
			
 
				+		unsigned long total_active, unsigned long active,
			
 
				+		unsigned long ratio, int file),
			
 
				+
			
 
				+	TP_ARGS(nid, reclaim_idx, total_inactive, inactive, total_active, active, ratio, file),
			
 
				+
			
 
				+	TP_STRUCT__entry(
			
 
				+		__field(int, nid)
			
 
				+		__field(int, reclaim_idx)
			
 
				+		__field(unsigned long, total_inactive)
			
 
				+		__field(unsigned long, inactive)
			
 
				+		__field(unsigned long, total_active)
			
 
				+		__field(unsigned long, active)
			
 
				+		__field(unsigned long, ratio)
			
 
				+		__field(int, reclaim_flags)
			
 
				+	),
			
 
				+
			
 
				+	TP_fast_assign(
			
 
				+		__entry->nid = nid;
			
 
				+		__entry->reclaim_idx = reclaim_idx;
			
 
				+		__entry->total_inactive = total_inactive;
			
 
				+		__entry->inactive = inactive;
			
 
				+		__entry->total_active = total_active;
			
 
				+		__entry->active = active;
			
 
				+		__entry->ratio = ratio;
			
 
				+		__entry->reclaim_flags = trace_shrink_flags(file) & RECLAIM_WB_LRU;
			
 
				+	),
			
 
				+
			
 
				+	TP_printk("nid=%d reclaim_idx=%d total_inactive=%ld inactive=%ld total_active=%ld active=%ld ratio=%ld flags=%s",
			
 
				+		__entry->nid,
			
 
				+		__entry->reclaim_idx,
			
 
				+		__entry->total_inactive, __entry->inactive,
			
 
				+		__entry->total_active, __entry->active,
			
 
				+		__entry->ratio,
			
 
				+		show_reclaim_flags(__entry->reclaim_flags))
			
 
				+);
			
 
				 #endif /* _TRACE_VMSCAN_H */
			
 
				 
			
 
				 /* This part must be outside protection */
			
--- a/include/trace/trace_events.h
+++ b/include/trace/trace_events.h
@@ -283,8 +283,16 @@ TRACE_MAKE_SYSTEM_STR();
 
				 		trace_print_symbols_seq(p, value, symbols);		\
			
 
				 	})
			
 
				 
			
 
				+#undef __print_flags_u64
			
 
				 #undef __print_symbolic_u64
			
 
				 #if BITS_PER_LONG == 32
			
 
				+#define __print_flags_u64(flag, delim, flag_array...)			\
			
 
				+	({								\
			
 
				+		static const struct trace_print_flags_u64 __flags[] =	\
			
 
				+			{ flag_array, { -1, NULL } };			\
			
 
				+		trace_print_flags_seq_u64(p, delim, flag, __flags);	\
			
 
				+	})
			
 
				+
			
 
				 #define __print_symbolic_u64(value, symbol_array...)			\
			
 
				 	({								\
			
 
				 		static const struct trace_print_flags_u64 symbols[] =	\
			
@@ -292,6 +300,9 @@ TRACE_MAKE_SYSTEM_STR();
 
				 		trace_print_symbols_seq_u64(p, value, symbols);	\
			
 
				 	})
			
 
				 #else
			
 
				+#define __print_flags_u64(flag, delim, flag_array...)			\
			
 
				+			__print_flags(flag, delim, flag_array)
			
 
				+
			
 
				 #define __print_symbolic_u64(value, symbol_array...)			\
			
 
				 			__print_symbolic(value, symbol_array)
			
 
				 #endif
			
--- a/include/uapi/asm-generic/ioctl.h
+++ b/include/uapi/asm-generic/ioctl.h
@@ -48,6 +48,9 @@
 
				 /*
			
 
				  * Direction bits, which any architecture can choose to override
			
 
				  * before including this file.
			
 
				+ *
			
 
				+ * NOTE: _IOC_WRITE means userland is writing and kernel is
			
 
				+ * reading. _IOC_READ means userland is reading and kernel is writing.
			
 
				  */
			
 
				 
			
 
				 #ifndef _IOC_NONE
			
@@ -72,7 +75,12 @@
 
				 #define _IOC_TYPECHECK(t) (sizeof(t))
			
 
				 #endif
			
 
				 
			
 
				-/* used to create numbers */
			
 
				+/*
			
 
				+ * Used to create numbers.
			
 
				+ *
			
 
				+ * NOTE: _IOW means userland is writing and kernel is reading. _IOR
			
 
				+ * means userland is reading and kernel is writing.
			
 
				+ */
			
 
				 #define _IO(type,nr)		_IOC(_IOC_NONE,(type),(nr),0)
			
 
				 #define _IOR(type,nr,size)	_IOC(_IOC_READ,(type),(nr),(_IOC_TYPECHECK(size)))
			
 
				 #define _IOW(type,nr,size)	_IOC(_IOC_WRITE,(type),(nr),(_IOC_TYPECHECK(size)))
			
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -11,13 +11,18 @@
 
				 
			
 
				 #include <linux/types.h>
			
 
				 
			
 
				-#define UFFD_API ((__u64)0xAA)
			
 
				 /*
			
 
				- * After implementing the respective features it will become:
			
 
				- * #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \
			
 
				- *			      UFFD_FEATURE_EVENT_FORK)
			
 
				+ * If the UFFDIO_API is upgraded someday, the UFFDIO_UNREGISTER and
			
 
				+ * UFFDIO_WAKE ioctls should be defined as _IOW and not as _IOR.  In
			
 
				+ * userfaultfd.h we assumed the kernel was reading (instead _IOC_READ
			
 
				+ * means the userland is reading).
			
 
				  */
			
 
				-#define UFFD_API_FEATURES (0)
			
 
				+#define UFFD_API ((__u64)0xAA)
			
 
				+#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK |		\
			
 
				+			   UFFD_FEATURE_EVENT_REMAP |		\
			
 
				+			   UFFD_FEATURE_EVENT_MADVDONTNEED |	\
			
 
				+			   UFFD_FEATURE_MISSING_HUGETLBFS |	\
			
 
				+			   UFFD_FEATURE_MISSING_SHMEM)
			
 
				 #define UFFD_API_IOCTLS				\
			
 
				 	((__u64)1 << _UFFDIO_REGISTER |		\
			
 
				 	 (__u64)1 << _UFFDIO_UNREGISTER |	\
			
@@ -26,6 +31,9 @@
 
				 	((__u64)1 << _UFFDIO_WAKE |		\
			
 
				 	 (__u64)1 << _UFFDIO_COPY |		\
			
 
				 	 (__u64)1 << _UFFDIO_ZEROPAGE)
			
 
				+#define UFFD_API_RANGE_IOCTLS_BASIC		\
			
 
				+	((__u64)1 << _UFFDIO_WAKE |		\
			
 
				+	 (__u64)1 << _UFFDIO_COPY)
			
 
				 
			
 
				 /*
			
 
				  * Valid ioctl command number range with this API is from 0x00 to
			
@@ -71,6 +79,21 @@ struct uffd_msg {
 
				 			__u64	address;
			
 
				 		} pagefault;
			
 
				 
			
 
				+		struct {
			
 
				+			__u32	ufd;
			
 
				+		} fork;
			
 
				+
			
 
				+		struct {
			
 
				+			__u64	from;
			
 
				+			__u64	to;
			
 
				+			__u64	len;
			
 
				+		} remap;
			
 
				+
			
 
				+		struct {
			
 
				+			__u64	start;
			
 
				+			__u64	end;
			
 
				+		} madv_dn;
			
 
				+
			
 
				 		struct {
			
 
				 			/* unused reserved fields */
			
 
				 			__u64	reserved1;
			
@@ -84,9 +107,9 @@ struct uffd_msg {
 
				  * Start at 0x12 and not at 0 to be more strict against bugs.
			
 
				  */
			
 
				 #define UFFD_EVENT_PAGEFAULT	0x12
			
 
				-#if 0 /* not available yet */
			
 
				 #define UFFD_EVENT_FORK		0x13
			
 
				-#endif
			
 
				+#define UFFD_EVENT_REMAP	0x14
			
 
				+#define UFFD_EVENT_MADVDONTNEED	0x15
			
 
				 
			
 
				 /* flags for UFFD_EVENT_PAGEFAULT */
			
 
				 #define UFFD_PAGEFAULT_FLAG_WRITE	(1<<0)	/* If this was a write fault */
			
@@ -104,11 +127,37 @@ struct uffdio_api {
 
				 	 * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE
			
 
				 	 * are to be considered implicitly always enabled in all kernels as
			
 
				 	 * long as the uffdio_api.api requested matches UFFD_API.
			
 
				+	 *
			
 
				+	 * UFFD_FEATURE_MISSING_HUGETLBFS means an UFFDIO_REGISTER
			
 
				+	 * with UFFDIO_REGISTER_MODE_MISSING mode will succeed on
			
 
				+	 * hugetlbfs virtual memory ranges. Adding or not adding
			
 
				+	 * UFFD_FEATURE_MISSING_HUGETLBFS to uffdio_api.features has
			
 
				+	 * no real functional effect after UFFDIO_API returns, but
			
 
				+	 * it's only useful for an initial feature set probe at
			
 
				+	 * UFFDIO_API time. There are two ways to use it:
			
 
				+	 *
			
 
				+	 * 1) by adding UFFD_FEATURE_MISSING_HUGETLBFS to the
			
 
				+	 *    uffdio_api.features before calling UFFDIO_API, an error
			
 
				+	 *    will be returned by UFFDIO_API on a kernel without
			
 
				+	 *    hugetlbfs missing support
			
 
				+	 *
			
 
				+	 * 2) the UFFD_FEATURE_MISSING_HUGETLBFS can not be added in
			
 
				+	 *    uffdio_api.features and instead it will be set by the
			
 
				+	 *    kernel in the uffdio_api.features if the kernel supports
			
 
				+	 *    it, so userland can later check if the feature flag is
			
 
				+	 *    present in uffdio_api.features after UFFDIO_API
			
 
				+	 *    succeeded.
			
 
				+	 *
			
 
				+	 * UFFD_FEATURE_MISSING_SHMEM works the same as
			
 
				+	 * UFFD_FEATURE_MISSING_HUGETLBFS, but it applies to shmem
			
 
				+	 * (i.e. tmpfs and other shmem based APIs).
			
 
				 	 */
			
 
				-#if 0 /* not available yet */
			
 
				 #define UFFD_FEATURE_PAGEFAULT_FLAG_WP		(1<<0)
			
 
				 #define UFFD_FEATURE_EVENT_FORK			(1<<1)
			
 
				-#endif
			
 
				+#define UFFD_FEATURE_EVENT_REMAP		(1<<2)
			
 
				+#define UFFD_FEATURE_EVENT_MADVDONTNEED		(1<<3)
			
 
				+#define UFFD_FEATURE_MISSING_HUGETLBFS		(1<<4)
			
 
				+#define UFFD_FEATURE_MISSING_SHMEM		(1<<5)
			
 
				 	__u64 features;
			
 
				 
			
 
				 	__u64 ioctls;
			
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1781,6 +1781,20 @@ config SLUB_DEBUG
 
				 	  SLUB sysfs support. /sys/slab will not exist and there will be
			
 
				 	  no support for cache validation etc.
			
 
				 
			
 
				+config SLUB_MEMCG_SYSFS_ON
			
 
				+	default n
			
 
				+	bool "Enable memcg SLUB sysfs support by default" if EXPERT
			
 
				+	depends on SLUB && SYSFS && MEMCG
			
 
				+	help
			
 
				+	  SLUB creates a directory under /sys/kernel/slab for each
			
 
				+	  allocation cache to host info and debug files. If memory
			
 
				+	  cgroup is enabled, each cache can have per memory cgroup
			
 
				+	  caches. SLUB can create the same sysfs directories for these
			
 
				+	  caches under /sys/kernel/slab/CACHE/cgroup but it can lead
			
 
				+	  to a very high number of debug files being created. This is
			
 
				+	  controlled by slub_memcg_sysfs boot parameter and this
			
 
				+	  config option determines the parameter's default value.
			
 
				+
			
 
				 config COMPAT_BRK
			
 
				 	bool "Disable heap randomization"
			
 
				 	default y
			
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -55,6 +55,7 @@
 
				 #include <linux/rmap.h>
			
 
				 #include <linux/ksm.h>
			
 
				 #include <linux/acct.h>
			
 
				+#include <linux/userfaultfd_k.h>
			
 
				 #include <linux/tsacct_kern.h>
			
 
				 #include <linux/cn_proc.h>
			
 
				 #include <linux/freezer.h>
			
@@ -561,6 +562,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 
				 	struct rb_node **rb_link, *rb_parent;
			
 
				 	int retval;
			
 
				 	unsigned long charge;
			
 
				+	LIST_HEAD(uf);
			
 
				 
			
 
				 	uprobe_start_dup_mmap();
			
 
				 	if (down_write_killable(&oldmm->mmap_sem)) {
			
@@ -617,12 +619,13 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 
				 		if (retval)
			
 
				 			goto fail_nomem_policy;
			
 
				 		tmp->vm_mm = mm;
			
 
				+		retval = dup_userfaultfd(tmp, &uf);
			
 
				+		if (retval)
			
 
				+			goto fail_nomem_anon_vma_fork;
			
 
				 		if (anon_vma_fork(tmp, mpnt))
			
 
				 			goto fail_nomem_anon_vma_fork;
			
 
				-		tmp->vm_flags &=
			
 
				-			~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP);
			
 
				+		tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
			
 
				 		tmp->vm_next = tmp->vm_prev = NULL;
			
 
				-		tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
			
 
				 		file = tmp->vm_file;
			
 
				 		if (file) {
			
 
				 			struct inode *inode = file_inode(file);
			
@@ -678,6 +681,7 @@ out:
 
				 	up_write(&mm->mmap_sem);
			
 
				 	flush_tlb_mm(oldmm);
			
 
				 	up_write(&oldmm->mmap_sem);
			
 
				+	dup_userfaultfd_complete(&uf);
			
 
				 fail_uprobe_end:
			
 
				 	uprobe_end_dup_mmap();
			
 
				 	return retval;
			
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -123,6 +123,44 @@ trace_print_symbols_seq(struct trace_seq *p, unsigned long val,
 
				 EXPORT_SYMBOL(trace_print_symbols_seq);
			
 
				 
			
 
				 #if BITS_PER_LONG == 32
			
 
				+const char *
			
 
				+trace_print_flags_seq_u64(struct trace_seq *p, const char *delim,
			
 
				+		      unsigned long long flags,
			
 
				+		      const struct trace_print_flags_u64 *flag_array)
			
 
				+{
			
 
				+	unsigned long long mask;
			
 
				+	const char *str;
			
 
				+	const char *ret = trace_seq_buffer_ptr(p);
			
 
				+	int i, first = 1;
			
 
				+
			
 
				+	for (i = 0;  flag_array[i].name && flags; i++) {
			
 
				+
			
 
				+		mask = flag_array[i].mask;
			
 
				+		if ((flags & mask) != mask)
			
 
				+			continue;
			
 
				+
			
 
				+		str = flag_array[i].name;
			
 
				+		flags &= ~mask;
			
 
				+		if (!first && delim)
			
 
				+			trace_seq_puts(p, delim);
			
 
				+		else
			
 
				+			first = 0;
			
 
				+		trace_seq_puts(p, str);
			
 
				+	}
			
 
				+
			
 
				+	/* check for left over flags */
			
 
				+	if (flags) {
			
 
				+		if (!first && delim)
			
 
				+			trace_seq_puts(p, delim);
			
 
				+		trace_seq_printf(p, "0x%llx", flags);
			
 
				+	}
			
 
				+
			
 
				+	trace_seq_putc(p, 0);
			
 
				+
			
 
				+	return ret;
			
 
				+}
			
 
				+EXPORT_SYMBOL(trace_print_flags_seq_u64);
			
 
				+
			
 
				 const char *
			
 
				 trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
			
 
				 			 const struct trace_print_flags_u64 *symbol_array)
			
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -137,12 +137,14 @@ static void watchdog_overflow_callback(struct perf_event *event,
 
				  * Reduce the watchdog noise by only printing messages
			
 
				  * that are different from what cpu0 displayed.
			
 
				  */
			
 
				-static unsigned long cpu0_err;
			
 
				+static unsigned long firstcpu_err;
			
 
				+static atomic_t watchdog_cpus;
			
 
				 
			
 
				 int watchdog_nmi_enable(unsigned int cpu)
			
 
				 {
			
 
				 	struct perf_event_attr *wd_attr;
			
 
				 	struct perf_event *event = per_cpu(watchdog_ev, cpu);
			
 
				+	int firstcpu = 0;
			
 
				 
			
 
				 	/* nothing to do if the hard lockup detector is disabled */
			
 
				 	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
			
@@ -156,19 +158,22 @@ int watchdog_nmi_enable(unsigned int cpu)
 
				 	if (event != NULL)
			
 
				 		goto out_enable;
			
 
				 
			
 
				+	if (atomic_inc_return(&watchdog_cpus) == 1)
			
 
				+		firstcpu = 1;
			
 
				+
			
 
				 	wd_attr = &wd_hw_attr;
			
 
				 	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
			
 
				 
			
 
				 	/* Try to register using hardware perf events */
			
 
				 	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
			
 
				 
			
 
				-	/* save cpu0 error for future comparision */
			
 
				-	if (cpu == 0 && IS_ERR(event))
			
 
				-		cpu0_err = PTR_ERR(event);
			
 
				+	/* save the first cpu's error for future comparison */
			
 
				+	if (firstcpu && IS_ERR(event))
			
 
				+		firstcpu_err = PTR_ERR(event);
			
 
				 
			
 
				 	if (!IS_ERR(event)) {
			
 
				-		/* only print for cpu0 or different than cpu0 */
			
 
				-		if (cpu == 0 || cpu0_err)
			
 
				+		/* only print for the first cpu initialized, or if the first cpu recorded an error */
			
 
				+		if (firstcpu || firstcpu_err)
			
 
				 			pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
			
 
				 		goto out_save;
			
 
				 	}
			
@@ -186,7 +191,7 @@ int watchdog_nmi_enable(unsigned int cpu)
 
				 	smp_mb__after_atomic();
			
 
				 
			
 
				 	/* skip displaying the same error again */
			
 
				-	if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
			
 
				+	if (!firstcpu && (PTR_ERR(event) == firstcpu_err))
			
 
				 		return PTR_ERR(event);
			
 
				 
			
 
				 	/* vary the KERN level based on the returned errno */
			
@@ -222,9 +227,9 @@ void watchdog_nmi_disable(unsigned int cpu)
 
				 
			
 
				 		/* should be in cleanup, but blocks oprofile */
			
 
				 		perf_event_release_kernel(event);
			
 
				-	}
			
 
				-	if (cpu == 0) {
			
 
				+
			
 
				 		/* watchdog_nmi_enable() expects this to be zero initially. */
			
 
				-		cpu0_err = 0;
			
 
				+		if (atomic_dec_and_test(&watchdog_cpus))
			
 
				+			firstcpu_err = 0;
			
 
				 	}
			
 
				 }
			
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -1155,6 +1155,11 @@ static void check_unmap(struct dma_debug_entry *ref)
 
				 			   dir2name[ref->direction]);
			
 
				 	}
			
 
				 
			
 
				+	/*
			
 
				+	 * Drivers should use dma_mapping_error() to check the returned
			
 
				+	 * addresses of dma_map_single() and dma_map_page().
			
 
				+	 * If not, print this warning message. See Documentation/DMA-API.txt.
			
 
				+	 */
			
 
				 	if (entry->map_err_type == MAP_ERR_NOT_CHECKED) {
			
 
				 		err_printk(ref->dev, entry,
			
 
				 			   "DMA-API: device driver failed to check map error"
			
--- a/lib/show_mem.c
+++ b/lib/show_mem.c
@@ -9,13 +9,13 @@
 
				 #include <linux/quicklist.h>
			
 
				 #include <linux/cma.h>
			
 
				 
			
 
				-void show_mem(unsigned int filter)
			
 
				+void show_mem(unsigned int filter, nodemask_t *nodemask)
			
 
				 {
			
 
				 	pg_data_t *pgdat;
			
 
				 	unsigned long total = 0, reserved = 0, highmem = 0;
			
 
				 
			
 
				 	printk("Mem-Info:\n");
			
 
				-	show_free_areas(filter);
			
 
				+	show_free_areas(filter, nodemask);
			
 
				 
			
 
				 	for_each_online_pgdat(pgdat) {
			
 
				 		unsigned long flags;
			
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -35,7 +35,7 @@ obj-y			:= filemap.o mempool.o oom_kill.o \
 
				 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
			
 
				 			   util.o mmzone.o vmstat.o backing-dev.o \
			
 
				 			   mm_init.o mmu_context.o percpu.o slab_common.o \
			
 
				-			   compaction.o vmacache.o \
			
 
				+			   compaction.o vmacache.o swap_slots.o \
			
 
				 			   interval_tree.o list_lru.o workingset.o \
			
 
				 			   debug.o $(mmu-y)
			
 
				 
			
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -411,8 +411,8 @@ retry:
 
				 
			
 
				 	while (*node != NULL) {
			
 
				 		parent = *node;
			
 
				-		congested = container_of(parent, struct bdi_writeback_congested,
			
 
				-					 rb_node);
			
 
				+		congested = rb_entry(parent, struct bdi_writeback_congested,
			
 
				+				     rb_node);
			
 
				 		if (congested->blkcg_id < blkcg_id)
			
 
				 			node = &parent->rb_left;
			
 
				 		else if (congested->blkcg_id > blkcg_id)
			
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -53,7 +53,7 @@ early_param("bootmem_debug", bootmem_debug_setup);
 
				 
			
 
				 static unsigned long __init bootmap_bytes(unsigned long pages)
			
 
				 {
			
 
				-	unsigned long bytes = DIV_ROUND_UP(pages, 8);
			
 
				+	unsigned long bytes = DIV_ROUND_UP(pages, BITS_PER_BYTE);
			
 
				 
			
 
				 	return ALIGN(bytes, sizeof(long));
			
 
				 }
			
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -548,7 +548,7 @@ isolate_fail:
 
				 	if (blockpfn == end_pfn)
			
 
				 		update_pageblock_skip(cc, valid_page, total_isolated, false);
			
 
				 
			
 
				-	count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
			
 
				+	cc->total_free_scanned += nr_scanned;
			
 
				 	if (total_isolated)
			
 
				 		count_compact_events(COMPACTISOLATED, total_isolated);
			
 
				 	return total_isolated;
			
@@ -931,7 +931,7 @@ isolate_fail:
 
				 	trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
			
 
				 						nr_scanned, nr_isolated);
			
 
				 
			
 
				-	count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned);
			
 
				+	cc->total_migrate_scanned += nr_scanned;
			
 
				 	if (nr_isolated)
			
 
				 		count_compact_events(COMPACTISOLATED, nr_isolated);
			
 
				 
			
@@ -1631,6 +1631,9 @@ out:
 
				 			zone->compact_cached_free_pfn = free_pfn;
			
 
				 	}
			
 
				 
			
 
				+	count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned);
			
 
				+	count_compact_events(COMPACTFREE_SCANNED, cc->total_free_scanned);
			
 
				+
			
 
				 	trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
			
 
				 				cc->free_pfn, end_pfn, sync, ret);
			
 
				 
			
@@ -1645,6 +1648,8 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
 
				 	struct compact_control cc = {
			
 
				 		.nr_freepages = 0,
			
 
				 		.nr_migratepages = 0,
			
 
				+		.total_migrate_scanned = 0,
			
 
				+		.total_free_scanned = 0,
			
 
				 		.order = order,
			
 
				 		.gfp_mask = gfp_mask,
			
 
				 		.zone = zone,
			
@@ -1757,6 +1762,8 @@ static void compact_node(int nid)
 
				 	struct zone *zone;
			
 
				 	struct compact_control cc = {
			
 
				 		.order = -1,
			
 
				+		.total_migrate_scanned = 0,
			
 
				+		.total_free_scanned = 0,
			
 
				 		.mode = MIGRATE_SYNC,
			
 
				 		.ignore_skip_hint = true,
			
 
				 		.whole_zone = true,
			
@@ -1883,6 +1890,8 @@ static void kcompactd_do_work(pg_data_t *pgdat)
 
				 	struct zone *zone;
			
 
				 	struct compact_control cc = {
			
 
				 		.order = pgdat->kcompactd_max_order,
			
 
				+		.total_migrate_scanned = 0,
			
 
				+		.total_free_scanned = 0,
			
 
				 		.classzone_idx = pgdat->kcompactd_classzone_idx,
			
 
				 		.mode = MIGRATE_SYNC_LIGHT,
			
 
				 		.ignore_skip_hint = true,
			
@@ -1891,7 +1900,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
 
				 	};
			
 
				 	trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
			
 
				 							cc.classzone_idx);
			
 
				-	count_vm_event(KCOMPACTD_WAKE);
			
 
				+	count_compact_event(KCOMPACTD_WAKE);
			
 
				 
			
 
				 	for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) {
			
 
				 		int status;
			
@@ -1909,6 +1918,8 @@ static void kcompactd_do_work(pg_data_t *pgdat)
 
				 
			
 
				 		cc.nr_freepages = 0;
			
 
				 		cc.nr_migratepages = 0;
			
 
				+		cc.total_migrate_scanned = 0;
			
 
				+		cc.total_free_scanned = 0;
			
 
				 		cc.zone = zone;
			
 
				 		INIT_LIST_HEAD(&cc.freepages);
			
 
				 		INIT_LIST_HEAD(&cc.migratepages);
			
@@ -1927,6 +1938,11 @@ static void kcompactd_do_work(pg_data_t *pgdat)
 
				 			defer_compaction(zone, cc.order);
			
 
				 		}
			
 
				 
			
 
				+		count_compact_events(KCOMPACTD_MIGRATE_SCANNED,
			
 
				+				     cc.total_migrate_scanned);
			
 
				+		count_compact_events(KCOMPACTD_FREE_SCANNED,
			
 
				+				     cc.total_free_scanned);
			
 
				+
			
 
				 		VM_BUG_ON(!list_empty(&cc.freepages));
			
 
				 		VM_BUG_ON(!list_empty(&cc.migratepages));
			
 
				 	}
			
@@ -1950,6 +1966,13 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
 
				 	if (pgdat->kcompactd_max_order < order)
			
 
				 		pgdat->kcompactd_max_order = order;
			
 
				 
			
 
				+	/*
			
 
				+	 * Pairs with implicit barrier in wait_event_freezable()
			
 
				+	 * such that wakeups are not missed in the lockless
			
 
				+	 * waitqueue_active() call.
			
 
				+	 */
			
 
				+	smp_acquire__after_ctrl_dep();
			
 
				+
			
 
				 	if (pgdat->kcompactd_classzone_idx > classzone_idx)
			
 
				 		pgdat->kcompactd_classzone_idx = classzone_idx;
			
 
				 
			
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -788,7 +788,7 @@ static int wake_page_function(wait_queue_t *wait, unsigned mode, int sync, void
 
				 	return autoremove_wake_function(wait, mode, sync, key);
			
 
				 }
			
 
				 
			
 
				-void wake_up_page_bit(struct page *page, int bit_nr)
			
 
				+static void wake_up_page_bit(struct page *page, int bit_nr)
			
 
				 {
			
 
				 	wait_queue_head_t *q = page_waitqueue(page);
			
 
				 	struct wait_page_key key;
			
@@ -821,7 +821,13 @@ void wake_up_page_bit(struct page *page, int bit_nr)
 
				 	}
			
 
				 	spin_unlock_irqrestore(&q->lock, flags);
			
 
				 }
			
 
				-EXPORT_SYMBOL(wake_up_page_bit);
			
 
				+
			
 
				+static void wake_up_page(struct page *page, int bit)
			
 
				+{
			
 
				+	if (!PageWaiters(page))
			
 
				+		return;
			
 
				+	wake_up_page_bit(page, bit);
			
 
				+}
			
 
				 
			
 
				 static inline int wait_on_page_bit_common(wait_queue_head_t *q,
			
 
				 		struct page *page, int bit_nr, int state, bool lock)
			
@@ -1013,7 +1019,7 @@ EXPORT_SYMBOL_GPL(page_endio);
 
				 
			
 
				 /**
			
 
				  * __lock_page - get a lock on the page, assuming we need to sleep to get it
			
 
				- * @page: the page to lock
			
 
				+ * @__page: the page to lock
			
 
				  */
			
 
				 void __lock_page(struct page *__page)
			
 
				 {
			
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -572,7 +572,7 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 
				 			if (is_vm_hugetlb_page(vma)) {
			
 
				 				i = follow_hugetlb_page(mm, vma, pages, vmas,
			
 
				 						&start, &nr_pages, i,
			
 
				-						gup_flags);
			
 
				+						gup_flags, nonblocking);
			
 
				 				continue;
			
 
				 			}
			
 
				 		}
			
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -142,42 +142,6 @@ static struct shrinker huge_zero_page_shrinker = {
 
				 };
			
 
				 
			
 
				 #ifdef CONFIG_SYSFS
			
 
				-
			
 
				-static ssize_t triple_flag_store(struct kobject *kobj,
			
 
				-				 struct kobj_attribute *attr,
			
 
				-				 const char *buf, size_t count,
			
 
				-				 enum transparent_hugepage_flag enabled,
			
 
				-				 enum transparent_hugepage_flag deferred,
			
 
				-				 enum transparent_hugepage_flag req_madv)
			
 
				-{
			
 
				-	if (!memcmp("defer", buf,
			
 
				-		    min(sizeof("defer")-1, count))) {
			
 
				-		if (enabled == deferred)
			
 
				-			return -EINVAL;
			
 
				-		clear_bit(enabled, &transparent_hugepage_flags);
			
 
				-		clear_bit(req_madv, &transparent_hugepage_flags);
			
 
				-		set_bit(deferred, &transparent_hugepage_flags);
			
 
				-	} else if (!memcmp("always", buf,
			
 
				-		    min(sizeof("always")-1, count))) {
			
 
				-		clear_bit(deferred, &transparent_hugepage_flags);
			
 
				-		clear_bit(req_madv, &transparent_hugepage_flags);
			
 
				-		set_bit(enabled, &transparent_hugepage_flags);
			
 
				-	} else if (!memcmp("madvise", buf,
			
 
				-			   min(sizeof("madvise")-1, count))) {
			
 
				-		clear_bit(enabled, &transparent_hugepage_flags);
			
 
				-		clear_bit(deferred, &transparent_hugepage_flags);
			
 
				-		set_bit(req_madv, &transparent_hugepage_flags);
			
 
				-	} else if (!memcmp("never", buf,
			
 
				-			   min(sizeof("never")-1, count))) {
			
 
				-		clear_bit(enabled, &transparent_hugepage_flags);
			
 
				-		clear_bit(req_madv, &transparent_hugepage_flags);
			
 
				-		clear_bit(deferred, &transparent_hugepage_flags);
			
 
				-	} else
			
 
				-		return -EINVAL;
			
 
				-
			
 
				-	return count;
			
 
				-}
			
 
				-
			
 
				 static ssize_t enabled_show(struct kobject *kobj,
			
 
				 			    struct kobj_attribute *attr, char *buf)
			
 
				 {
			
@@ -193,19 +157,28 @@ static ssize_t enabled_store(struct kobject *kobj,
 
				 			     struct kobj_attribute *attr,
			
 
				 			     const char *buf, size_t count)
			
 
				 {
			
 
				-	ssize_t ret;
			
 
				+	ssize_t ret = count;
			
 
				 
			
 
				-	ret = triple_flag_store(kobj, attr, buf, count,
			
 
				-				TRANSPARENT_HUGEPAGE_FLAG,
			
 
				-				TRANSPARENT_HUGEPAGE_FLAG,
			
 
				-				TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
			
 
				+	if (!memcmp("always", buf,
			
 
				+		    min(sizeof("always")-1, count))) {
			
 
				+		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
			
 
				+		set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
			
 
				+	} else if (!memcmp("madvise", buf,
			
 
				+			   min(sizeof("madvise")-1, count))) {
			
 
				+		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
			
 
				+		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
			
 
				+	} else if (!memcmp("never", buf,
			
 
				+			   min(sizeof("never")-1, count))) {
			
 
				+		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
			
 
				+		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
			
 
				+	} else
			
 
				+		ret = -EINVAL;
			
 
				 
			
 
				 	if (ret > 0) {
			
 
				 		int err = start_stop_khugepaged();
			
 
				 		if (err)
			
 
				 			ret = err;
			
 
				 	}
			
 
				-
			
 
				 	return ret;
			
 
				 }
			
 
				 static struct kobj_attribute enabled_attr =
			
@@ -241,32 +214,58 @@ ssize_t single_hugepage_flag_store(struct kobject *kobj,
 
				 	return count;
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- * Currently defrag only disables __GFP_NOWAIT for allocation. A blind
			
 
				- * __GFP_REPEAT is too aggressive, it's never worth swapping tons of
			
 
				- * memory just to allocate one more hugepage.
			
 
				- */
			
 
				 static ssize_t defrag_show(struct kobject *kobj,
			
 
				 			   struct kobj_attribute *attr, char *buf)
			
 
				 {
			
 
				 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
			
 
				-		return sprintf(buf, "[always] defer madvise never\n");
			
 
				+		return sprintf(buf, "[always] defer defer+madvise madvise never\n");
			
 
				 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
			
 
				-		return sprintf(buf, "always [defer] madvise never\n");
			
 
				-	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
			
 
				-		return sprintf(buf, "always defer [madvise] never\n");
			
 
				-	else
			
 
				-		return sprintf(buf, "always defer madvise [never]\n");
			
 
				-
			
 
				+		return sprintf(buf, "always [defer] defer+madvise madvise never\n");
			
 
				+	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
			
 
				+		return sprintf(buf, "always defer [defer+madvise] madvise never\n");
			
 
				+	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
			
 
				+		return sprintf(buf, "always defer defer+madvise [madvise] never\n");
			
 
				+	return sprintf(buf, "always defer defer+madvise madvise [never]\n");
			
 
				 }
			
 
				+
			
 
				 static ssize_t defrag_store(struct kobject *kobj,
			
 
				 			    struct kobj_attribute *attr,
			
 
				 			    const char *buf, size_t count)
			
 
				 {
			
 
				-	return triple_flag_store(kobj, attr, buf, count,
			
 
				-				 TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
			
 
				-				 TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
			
 
				-				 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
			
 
				+	if (!memcmp("always", buf,
			
 
				+		    min(sizeof("always")-1, count))) {
			
 
				+		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
			
 
				+		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
			
 
				+		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
			
 
				+		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
			
 
				+	} else if (!memcmp("defer", buf,
			
 
				+		    min(sizeof("defer")-1, count))) {
			
 
				+		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
			
 
				+		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
			
 
				+		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
			
 
				+		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
			
 
				+	} else if (!memcmp("defer+madvise", buf,
			
 
				+		    min(sizeof("defer+madvise")-1, count))) {
			
 
				+		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
			
 
				+		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
			
 
				+		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
			
 
				+		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
			
 
				+	} else if (!memcmp("madvise", buf,
			
 
				+			   min(sizeof("madvise")-1, count))) {
			
 
				+		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
			
 
				+		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
			
 
				+		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
			
 
				+		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
			
 
				+	} else if (!memcmp("never", buf,
			
 
				+			   min(sizeof("never")-1, count))) {
			
 
				+		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
			
 
				+		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
			
 
				+		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
			
 
				+		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
			
 
				+	} else
			
 
				+		return -EINVAL;
			
 
				+
			
 
				+	return count;
			
 
				 }
			
 
				 static struct kobj_attribute defrag_attr =
			
 
				 	__ATTR(defrag, 0644, defrag_show, defrag_store);
			
@@ -612,25 +611,28 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
 
				 }
			
 
				 
			
 
				 /*
			
 
				- * If THP defrag is set to always then directly reclaim/compact as necessary
			
 
				- * If set to defer then do only background reclaim/compact and defer to khugepaged
			
 
				- * If set to madvise and the VMA is flagged then directly reclaim/compact
			
 
				- * When direct reclaim/compact is allowed, don't retry except for flagged VMA's
			
 
				+ * always: directly stall for all thp allocations
			
 
				+ * defer: wake kswapd and fail if not immediately available
			
 
				+ * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
			
 
				+ *		  fail if not immediately available
			
 
				+ * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
			
 
				+ *	    available
			
 
				+ * never: never stall for any thp allocation
			
 
				  */
			
 
				 static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
			
 
				 {
			
 
				-	bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
			
 
				+	const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
			
 
				 
			
 
				-	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
			
 
				-				&transparent_hugepage_flags) && vma_madvised)
			
 
				-		return GFP_TRANSHUGE;
			
 
				-	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
			
 
				-						&transparent_hugepage_flags))
			
 
				-		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
			
 
				-	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
			
 
				-						&transparent_hugepage_flags))
			
 
				+	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
			
 
				 		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
			
 
				-
			
 
				+	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
			
 
				+		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
			
 
				+	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
			
 
				+		return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
			
 
				+							     __GFP_KSWAPD_RECLAIM);
			
 
				+	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
			
 
				+		return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
			
 
				+							     0);
			
 
				 	return GFP_TRANSHUGE_LIGHT;
			
 
				 }
			
 
				 
			
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -32,6 +32,7 @@
 
				 #include <linux/hugetlb.h>
			
 
				 #include <linux/hugetlb_cgroup.h>
			
 
				 #include <linux/node.h>
			
 
				+#include <linux/userfaultfd_k.h>
			
 
				 #include "internal.h"
			
 
				 
			
 
				 int hugepages_treat_as_movable;
			
@@ -3680,6 +3681,38 @@ retry:
 
				 		size = i_size_read(mapping->host) >> huge_page_shift(h);
			
 
				 		if (idx >= size)
			
 
				 			goto out;
			
 
				+
			
 
				+		/*
			
 
				+		 * Check for page in userfault range
			
 
				+		 */
			
 
				+		if (userfaultfd_missing(vma)) {
			
 
				+			u32 hash;
			
 
				+			struct vm_fault vmf = {
			
 
				+				.vma = vma,
			
 
				+				.address = address,
			
 
				+				.flags = flags,
			
 
				+				/*
			
 
				+				 * Hard to debug if it ends up being
			
 
				+				 * used by a callee that assumes
			
 
				+				 * something about the other
			
 
				+				 * uninitialized fields... same as in
			
 
				+				 * memory.c
			
 
				+				 */
			
 
				+			};
			
 
				+
			
 
				+			/*
			
 
				+			 * hugetlb_fault_mutex must be dropped before
			
 
				+			 * handling userfault.  Reacquire after handling
			
 
				+			 * fault to make calling code simpler.
			
 
				+			 */
			
 
				+			hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
			
 
				+							idx, address);
			
 
				+			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			
 
				+			ret = handle_userfault(&vmf, VM_UFFD_MISSING);
			
 
				+			mutex_lock(&hugetlb_fault_mutex_table[hash]);
			
 
				+			goto out;
			
 
				+		}
			
 
				+
			
 
				 		page = alloc_huge_page(vma, address, 0);
			
 
				 		if (IS_ERR(page)) {
			
 
				 			ret = PTR_ERR(page);
			
@@ -3948,10 +3981,113 @@ out_mutex:
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				+/*
			
 
				+ * Used by userfaultfd UFFDIO_COPY.  Based on mcopy_atomic_pte with
			
 
				+ * modifications for huge pages.
			
 
				+ */
			
 
				+int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
			
 
				+			    pte_t *dst_pte,
			
 
				+			    struct vm_area_struct *dst_vma,
			
 
				+			    unsigned long dst_addr,
			
 
				+			    unsigned long src_addr,
			
 
				+			    struct page **pagep)
			
 
				+{
			
 
				+	int vm_shared = dst_vma->vm_flags & VM_SHARED;
			
 
				+	struct hstate *h = hstate_vma(dst_vma);
			
 
				+	pte_t _dst_pte;
			
 
				+	spinlock_t *ptl;
			
 
				+	int ret;
			
 
				+	struct page *page;
			
 
				+
			
 
				+	if (!*pagep) {
			
 
				+		ret = -ENOMEM;
			
 
				+		page = alloc_huge_page(dst_vma, dst_addr, 0);
			
 
				+		if (IS_ERR(page))
			
 
				+			goto out;
			
 
				+
			
 
				+		ret = copy_huge_page_from_user(page,
			
 
				+						(const void __user *) src_addr,
			
 
				+						pages_per_huge_page(h), false);
			
 
				+
			
 
				+		/* fallback to copy_from_user outside mmap_sem */
			
 
				+		if (unlikely(ret)) {
			
 
				+			ret = -EFAULT;
			
 
				+			*pagep = page;
			
 
				+			/* don't free the page */
			
 
				+			goto out;
			
 
				+		}
			
 
				+	} else {
			
 
				+		page = *pagep;
			
 
				+		*pagep = NULL;
			
 
				+	}
			
 
				+
			
 
				+	/*
			
 
				+	 * The memory barrier inside __SetPageUptodate makes sure that
			
 
				+	 * preceding stores to the page contents become visible before
			
 
				+	 * the set_pte_at() write.
			
 
				+	 */
			
 
				+	__SetPageUptodate(page);
			
 
				+	set_page_huge_active(page);
			
 
				+
			
 
				+	/*
			
 
				+	 * If shared, add to page cache
			
 
				+	 */
			
 
				+	if (vm_shared) {
			
 
				+		struct address_space *mapping = dst_vma->vm_file->f_mapping;
			
 
				+		pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
			
 
				+
			
 
				+		ret = huge_add_to_page_cache(page, mapping, idx);
			
 
				+		if (ret)
			
 
				+			goto out_release_nounlock;
			
 
				+	}
			
 
				+
			
 
				+	ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
			
 
				+	spin_lock(ptl);
			
 
				+
			
 
				+	ret = -EEXIST;
			
 
				+	if (!huge_pte_none(huge_ptep_get(dst_pte)))
			
 
				+		goto out_release_unlock;
			
 
				+
			
 
				+	if (vm_shared) {
			
 
				+		page_dup_rmap(page, true);
			
 
				+	} else {
			
 
				+		ClearPagePrivate(page);
			
 
				+		hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
			
 
				+	}
			
 
				+
			
 
				+	_dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE);
			
 
				+	if (dst_vma->vm_flags & VM_WRITE)
			
 
				+		_dst_pte = huge_pte_mkdirty(_dst_pte);
			
 
				+	_dst_pte = pte_mkyoung(_dst_pte);
			
 
				+
			
 
				+	set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
			
 
				+
			
 
				+	(void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
			
 
				+					dst_vma->vm_flags & VM_WRITE);
			
 
				+	hugetlb_count_add(pages_per_huge_page(h), dst_mm);
			
 
				+
			
 
				+	/* No need to invalidate - it was non-present before */
			
 
				+	update_mmu_cache(dst_vma, dst_addr, dst_pte);
			
 
				+
			
 
				+	spin_unlock(ptl);
			
 
				+	if (vm_shared)
			
 
				+		unlock_page(page);
			
 
				+	ret = 0;
			
 
				+out:
			
 
				+	return ret;
			
 
				+out_release_unlock:
			
 
				+	spin_unlock(ptl);
			
 
				+out_release_nounlock:
			
 
				+	if (vm_shared)
			
 
				+		unlock_page(page);
			
 
				+	put_page(page);
			
 
				+	goto out;
			
 
				+}
			
 
				+
			
 
				 long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
			
 
				 			 struct page **pages, struct vm_area_struct **vmas,
			
 
				 			 unsigned long *position, unsigned long *nr_pages,
			
 
				-			 long i, unsigned int flags)
			
 
				+			 long i, unsigned int flags, int *nonblocking)
			
 
				 {
			
 
				 	unsigned long pfn_offset;
			
 
				 	unsigned long vaddr = *position;
			
@@ -4014,16 +4150,43 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
				 		    ((flags & FOLL_WRITE) &&
			
 
				 		      !huge_pte_write(huge_ptep_get(pte)))) {
			
 
				 			int ret;
			
 
				+			unsigned int fault_flags = 0;
			
 
				 
			
 
				 			if (pte)
			
 
				 				spin_unlock(ptl);
			
 
				-			ret = hugetlb_fault(mm, vma, vaddr,
			
 
				-				(flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
			
 
				-			if (!(ret & VM_FAULT_ERROR))
			
 
				-				continue;
			
 
				-
			
 
				-			remainder = 0;
			
 
				-			break;
			
 
				+			if (flags & FOLL_WRITE)
			
 
				+				fault_flags |= FAULT_FLAG_WRITE;
			
 
				+			if (nonblocking)
			
 
				+				fault_flags |= FAULT_FLAG_ALLOW_RETRY;
			
 
				+			if (flags & FOLL_NOWAIT)
			
 
				+				fault_flags |= FAULT_FLAG_ALLOW_RETRY |
			
 
				+					FAULT_FLAG_RETRY_NOWAIT;
			
 
				+			if (flags & FOLL_TRIED) {
			
 
				+				VM_WARN_ON_ONCE(fault_flags &
			
 
				+						FAULT_FLAG_ALLOW_RETRY);
			
 
				+				fault_flags |= FAULT_FLAG_TRIED;
			
 
				+			}
			
 
				+			ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
			
 
				+			if (ret & VM_FAULT_ERROR) {
			
 
				+				remainder = 0;
			
 
				+				break;
			
 
				+			}
			
 
				+			if (ret & VM_FAULT_RETRY) {
			
 
				+				if (nonblocking)
			
 
				+					*nonblocking = 0;
			
 
				+				*nr_pages = 0;
			
 
				+				/*
			
 
				+				 * VM_FAULT_RETRY must not return an
			
 
				+				 * error, it will return zero
			
 
				+				 * instead.
			
 
				+				 *
			
 
				+				 * No need to update "position" as the
			
 
				+				 * caller will not check it after
			
 
				+				 * *nr_pages is set to 0.
			
 
				+				 */
			
 
				+				return i;
			
 
				+			}
			
 
				+			continue;
			
 
				 		}
			
 
				 
			
 
				 		pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
			
@@ -4052,6 +4215,11 @@ same_page:
 
				 		spin_unlock(ptl);
			
 
				 	}
			
 
				 	*nr_pages = remainder;
			
 
				+	/*
			
 
				+	 * setting position is actually required only if remainder is
			
 
				+	 * not zero but it's faster not to add a "if (remainder)"
			
 
				+	 * branch.
			
 
				+	 */
			
 
				 	*position = vaddr;
			
 
				 
			
 
				 	return i ? i : -EFAULT;
			
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -43,6 +43,11 @@ int do_swap_page(struct vm_fault *vmf);
 
				 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
			
 
				 		unsigned long floor, unsigned long ceiling);
			
 
				 
			
 
				+static inline bool can_madv_dontneed_vma(struct vm_area_struct *vma)
			
 
				+{
			
 
				+	return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
			
 
				+}
			
 
				+
			
 
				 void unmap_page_range(struct mmu_gather *tlb,
			
 
				 			     struct vm_area_struct *vma,
			
 
				 			     unsigned long addr, unsigned long end,
			
@@ -133,9 +138,9 @@ struct alloc_context {
 
				  * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
			
 
				  */
			
 
				 static inline unsigned long
			
 
				-__find_buddy_index(unsigned long page_idx, unsigned int order)
			
 
				+__find_buddy_pfn(unsigned long page_pfn, unsigned int order)
			
 
				 {
			
 
				-	return page_idx ^ (1 << order);
			
 
				+	return page_pfn ^ (1 << order);
			
 
				 }
			
 
				 
			
 
				 extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
			
@@ -175,6 +180,8 @@ struct compact_control {
 
				 	struct list_head migratepages;	/* List of pages being migrated */
			
 
				 	unsigned long nr_freepages;	/* Number of isolated free pages */
			
 
				 	unsigned long nr_migratepages;	/* Number of pages to migrate */
			
 
				+	unsigned long total_migrate_scanned;
			
 
				+	unsigned long total_free_scanned;
			
 
				 	unsigned long free_pfn;		/* isolate_freepages search base */
			
 
				 	unsigned long migrate_pfn;	/* isolate_migratepages search base */
			
 
				 	unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
			
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -10,6 +10,7 @@
 
				 #include <linux/syscalls.h>
			
 
				 #include <linux/mempolicy.h>
			
 
				 #include <linux/page-isolation.h>
			
 
				+#include <linux/userfaultfd_k.h>
			
 
				 #include <linux/hugetlb.h>
			
 
				 #include <linux/falloc.h>
			
 
				 #include <linux/sched.h>
			
@@ -24,6 +25,8 @@
 
				 
			
 
				 #include <asm/tlb.h>
			
 
				 
			
 
				+#include "internal.h"
			
 
				+
			
 
				 /*
			
 
				  * Any behaviour which results in changes to the vma->vm_flags needs to
			
 
				  * take mmap_sem for writing. Others, which simply traverse vmas, need
			
@@ -473,10 +476,11 @@ static long madvise_dontneed(struct vm_area_struct *vma,
 
				 			     unsigned long start, unsigned long end)
			
 
				 {
			
 
				 	*prev = vma;
			
 
				-	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
			
 
				+	if (!can_madv_dontneed_vma(vma))
			
 
				 		return -EINVAL;
			
 
				 
			
 
				-	zap_page_range(vma, start, end - start, NULL);
			
 
				+	madvise_userfault_dontneed(vma, prev, start, end);
			
 
				+	zap_page_range(vma, start, end - start);
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -611,10 +611,10 @@ int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
 
				 
			
 
				 int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
			
 
				 {
			
 
				-	memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n",
			
 
				-		     (unsigned long long)base,
			
 
				-		     (unsigned long long)base + size - 1,
			
 
				-		     0UL, (void *)_RET_IP_);
			
 
				+	phys_addr_t end = base + size - 1;
			
 
				+
			
 
				+	memblock_dbg("memblock_add: [%pa-%pa] %pF\n",
			
 
				+		     &base, &end, (void *)_RET_IP_);
			
 
				 
			
 
				 	return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0);
			
 
				 }
			
@@ -718,10 +718,10 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
 
				 
			
 
				 int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
			
 
				 {
			
 
				-	memblock_dbg("   memblock_free: [%#016llx-%#016llx] %pF\n",
			
 
				-		     (unsigned long long)base,
			
 
				-		     (unsigned long long)base + size - 1,
			
 
				-		     (void *)_RET_IP_);
			
 
				+	phys_addr_t end = base + size - 1;
			
 
				+
			
 
				+	memblock_dbg("   memblock_free: [%pa-%pa] %pF\n",
			
 
				+		     &base, &end, (void *)_RET_IP_);
			
 
				 
			
 
				 	kmemleak_free_part_phys(base, size);
			
 
				 	return memblock_remove_range(&memblock.reserved, base, size);
			
@@ -729,10 +729,10 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
 
				 
			
 
				 int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
			
 
				 {
			
 
				-	memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n",
			
 
				-		     (unsigned long long)base,
			
 
				-		     (unsigned long long)base + size - 1,
			
 
				-		     0UL, (void *)_RET_IP_);
			
 
				+	phys_addr_t end = base + size - 1;
			
 
				+
			
 
				+	memblock_dbg("memblock_reserve: [%pa-%pa] %pF\n",
			
 
				+		     &base, &end, (void *)_RET_IP_);
			
 
				 
			
 
				 	return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0);
			
 
				 }
			
@@ -1105,6 +1105,31 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
 
				 		*out_nid = r->nid;
			
 
				 }
			
 
				 
			
 
				+unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn,
			
 
				+						      unsigned long max_pfn)
			
 
				+{
			
 
				+	struct memblock_type *type = &memblock.memory;
			
 
				+	unsigned int right = type->cnt;
			
 
				+	unsigned int mid, left = 0;
			
 
				+	phys_addr_t addr = PFN_PHYS(pfn + 1);
			
 
				+
			
 
				+	do {
			
 
				+		mid = (right + left) / 2;
			
 
				+
			
 
				+		if (addr < type->regions[mid].base)
			
 
				+			right = mid;
			
 
				+		else if (addr >= (type->regions[mid].base +
			
 
				+				  type->regions[mid].size))
			
 
				+			left = mid + 1;
			
 
				+		else {
			
 
				+			/* addr is within the region, so pfn + 1 is valid */
			
 
				+			return min(pfn + 1, max_pfn);
			
 
				+		}
			
 
				+	} while (left < right);
			
 
				+
			
 
				+	return min(PHYS_PFN(type->regions[right].base), max_pfn);
			
 
				+}
			
 
				+
			
 
				 /**
			
 
				  * memblock_set_node - set node ID on memblock regions
			
 
				  * @base: base of area to set node ID for
			
@@ -1202,8 +1227,8 @@ phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys
 
				 	alloc = __memblock_alloc_base(size, align, max_addr);
			
 
				 
			
 
				 	if (alloc == 0)
			
 
				-		panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n",
			
 
				-		      (unsigned long long) size, (unsigned long long) max_addr);
			
 
				+		panic("ERROR: Failed to allocate %pa bytes below %pa.\n",
			
 
				+		      &size, &max_addr);
			
 
				 
			
 
				 	return alloc;
			
 
				 }
			
@@ -1274,18 +1299,17 @@ static void * __init memblock_virt_alloc_internal(
 
				 
			
 
				 	if (max_addr > memblock.current_limit)
			
 
				 		max_addr = memblock.current_limit;
			
 
				-
			
 
				 again:
			
 
				 	alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
			
 
				 					    nid, flags);
			
 
				-	if (alloc)
			
 
				+	if (alloc && !memblock_reserve(alloc, size))
			
 
				 		goto done;
			
 
				 
			
 
				 	if (nid != NUMA_NO_NODE) {
			
 
				 		alloc = memblock_find_in_range_node(size, align, min_addr,
			
 
				 						    max_addr, NUMA_NO_NODE,
			
 
				 						    flags);
			
 
				-		if (alloc)
			
 
				+		if (alloc && !memblock_reserve(alloc, size))
			
 
				 			goto done;
			
 
				 	}
			
 
				 
			
@@ -1303,7 +1327,6 @@ again:
 
				 
			
 
				 	return NULL;
			
 
				 done:
			
 
				-	memblock_reserve(alloc, size);
			
 
				 	ptr = phys_to_virt(alloc);
			
 
				 	memset(ptr, 0, size);
			
 
				 
			
@@ -1615,8 +1638,7 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size
 
				 
			
 
				 	if (idx == -1)
			
 
				 		return 0;
			
 
				-	return memblock.memory.regions[idx].base <= base &&
			
 
				-		(memblock.memory.regions[idx].base +
			
 
				+	return (memblock.memory.regions[idx].base +
			
 
				 		 memblock.memory.regions[idx].size) >= end;
			
 
				 }
			
 
				 
			
@@ -1673,7 +1695,7 @@ phys_addr_t __init_memblock memblock_get_current_limit(void)
 
				 
			
 
				 static void __init_memblock memblock_dump(struct memblock_type *type, char *name)
			
 
				 {
			
 
				-	unsigned long long base, size;
			
 
				+	phys_addr_t base, end, size;
			
 
				 	unsigned long flags;
			
 
				 	int idx;
			
 
				 	struct memblock_region *rgn;
			
@@ -1685,23 +1707,24 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name
 
				 
			
 
				 		base = rgn->base;
			
 
				 		size = rgn->size;
			
 
				+		end = base + size - 1;
			
 
				 		flags = rgn->flags;
			
 
				 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
			
 
				 		if (memblock_get_region_node(rgn) != MAX_NUMNODES)
			
 
				 			snprintf(nid_buf, sizeof(nid_buf), " on node %d",
			
 
				 				 memblock_get_region_node(rgn));
			
 
				 #endif
			
 
				-		pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n",
			
 
				-			name, idx, base, base + size - 1, size, nid_buf, flags);
			
 
				+		pr_info(" %s[%#x]\t[%pa-%pa], %pa bytes%s flags: %#lx\n",
			
 
				+			name, idx, &base, &end, &size, nid_buf, flags);
			
 
				 	}
			
 
				 }
			
 
				 
			
 
				 void __init_memblock __memblock_dump_all(void)
			
 
				 {
			
 
				 	pr_info("MEMBLOCK configuration:\n");
			
 
				-	pr_info(" memory size = %#llx reserved size = %#llx\n",
			
 
				-		(unsigned long long)memblock.memory.total_size,
			
 
				-		(unsigned long long)memblock.reserved.total_size);
			
 
				+	pr_info(" memory size = %pa reserved size = %pa\n",
			
 
				+		&memblock.memory.total_size,
			
 
				+		&memblock.reserved.total_size);
			
 
				 
			
 
				 	memblock_dump(&memblock.memory, "memory");
			
 
				 	memblock_dump(&memblock.reserved, "reserved");
			
@@ -1727,19 +1750,14 @@ static int memblock_debug_show(struct seq_file *m, void *private)
 
				 	struct memblock_type *type = m->private;
			
 
				 	struct memblock_region *reg;
			
 
				 	int i;
			
 
				+	phys_addr_t end;
			
 
				 
			
 
				 	for (i = 0; i < type->cnt; i++) {
			
 
				 		reg = &type->regions[i];
			
 
				-		seq_printf(m, "%4d: ", i);
			
 
				-		if (sizeof(phys_addr_t) == 4)
			
 
				-			seq_printf(m, "0x%08lx..0x%08lx\n",
			
 
				-				   (unsigned long)reg->base,
			
 
				-				   (unsigned long)(reg->base + reg->size - 1));
			
 
				-		else
			
 
				-			seq_printf(m, "0x%016llx..0x%016llx\n",
			
 
				-				   (unsigned long long)reg->base,
			
 
				-				   (unsigned long long)(reg->base + reg->size - 1));
			
 
				+		end = reg->base + reg->size - 1;
			
 
				 
			
 
				+		seq_printf(m, "%4d: ", i);
			
 
				+		seq_printf(m, "%pa..%pa\n", &reg->base, &end);
			
 
				 	}
			
 
				 	return 0;
			
 
				 }
			
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -317,6 +317,8 @@ void memcg_put_cache_ids(void)
 
				 DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
			
 
				 EXPORT_SYMBOL(memcg_kmem_enabled_key);
			
 
				 
			
 
				+struct workqueue_struct *memcg_kmem_cache_wq;
			
 
				+
			
 
				 #endif /* !CONFIG_SLOB */
			
 
				 
			
 
				 /**
			
@@ -2143,8 +2145,6 @@ struct memcg_kmem_cache_create_work {
 
				 	struct work_struct work;
			
 
				 };
			
 
				 
			
 
				-static struct workqueue_struct *memcg_kmem_cache_create_wq;
			
 
				-
			
 
				 static void memcg_kmem_cache_create_func(struct work_struct *w)
			
 
				 {
			
 
				 	struct memcg_kmem_cache_create_work *cw =
			
@@ -2176,7 +2176,7 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
 
				 	cw->cachep = cachep;
			
 
				 	INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
			
 
				 
			
 
				-	queue_work(memcg_kmem_cache_create_wq, &cw->work);
			
 
				+	queue_work(memcg_kmem_cache_wq, &cw->work);
			
 
				 }
			
 
				 
			
 
				 static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
			
@@ -2837,6 +2837,7 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
 
				 	 */
			
 
				 	memcg->kmemcg_id = memcg_id;
			
 
				 	memcg->kmem_state = KMEM_ONLINE;
			
 
				+	INIT_LIST_HEAD(&memcg->kmem_caches);
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
@@ -4002,9 +4003,9 @@ static struct cftype mem_cgroup_legacy_files[] = {
 
				 #ifdef CONFIG_SLABINFO
			
 
				 	{
			
 
				 		.name = "kmem.slabinfo",
			
 
				-		.seq_start = slab_start,
			
 
				-		.seq_next = slab_next,
			
 
				-		.seq_stop = slab_stop,
			
 
				+		.seq_start = memcg_slab_start,
			
 
				+		.seq_next = memcg_slab_next,
			
 
				+		.seq_stop = memcg_slab_stop,
			
 
				 		.seq_show = memcg_slab_show,
			
 
				 	},
			
 
				 #endif
			
@@ -5777,12 +5778,12 @@ static int __init mem_cgroup_init(void)
 
				 #ifndef CONFIG_SLOB
			
 
				 	/*
			
 
				 	 * Kmem cache creation is mostly done with the slab_mutex held,
			
 
				-	 * so use a special workqueue to avoid stalling all worker
			
 
				-	 * threads in case lots of cgroups are created simultaneously.
			
 
				+	 * so use a workqueue with limited concurrency to avoid stalling
			
 
				+	 * all worker threads in case lots of cgroups are created and
			
 
				+	 * destroyed simultaneously.
			
 
				 	 */
			
 
				-	memcg_kmem_cache_create_wq =
			
 
				-		alloc_ordered_workqueue("memcg_kmem_cache_create", 0);
			
 
				-	BUG_ON(!memcg_kmem_cache_create_wq);
			
 
				+	memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1);
			
 
				+	BUG_ON(!memcg_kmem_cache_wq);
			
 
				 #endif
			
 
				 
			
 
				 	cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
			
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1155,12 +1155,6 @@ again:
 
				 
			
 
				 			if (!PageAnon(page)) {
			
 
				 				if (pte_dirty(ptent)) {
			
 
				-					/*
			
 
				-					 * oom_reaper cannot tear down dirty
			
 
				-					 * pages
			
 
				-					 */
			
 
				-					if (unlikely(details && details->ignore_dirty))
			
 
				-						continue;
			
 
				 					force_flush = 1;
			
 
				 					set_page_dirty(page);
			
 
				 				}
			
@@ -1179,8 +1173,8 @@ again:
 
				 			}
			
 
				 			continue;
			
 
				 		}
			
 
				-		/* only check swap_entries if explicitly asked for in details */
			
 
				-		if (unlikely(details && !details->check_swap_entries))
			
 
				+		/* If details->check_mapping, we leave swap entries. */
			
 
				+		if (unlikely(details))
			
 
				 			continue;
			
 
				 
			
 
				 		entry = pte_to_swp_entry(ptent);
			
@@ -1376,12 +1370,11 @@ void unmap_vmas(struct mmu_gather *tlb,
 
				  * @vma: vm_area_struct holding the applicable pages
			
 
				  * @start: starting address of pages to zap
			
 
				  * @size: number of bytes to zap
			
 
				- * @details: details of shared cache invalidation
			
 
				  *
			
 
				  * Caller must protect the VMA list
			
 
				  */
			
 
				 void zap_page_range(struct vm_area_struct *vma, unsigned long start,
			
 
				-		unsigned long size, struct zap_details *details)
			
 
				+		unsigned long size)
			
 
				 {
			
 
				 	struct mm_struct *mm = vma->vm_mm;
			
 
				 	struct mmu_gather tlb;
			
@@ -1392,7 +1385,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
 
				 	update_hiwater_rss(mm);
			
 
				 	mmu_notifier_invalidate_range_start(mm, start, end);
			
 
				 	for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
			
 
				-		unmap_single_vma(&tlb, vma, start, end, details);
			
 
				+		unmap_single_vma(&tlb, vma, start, end, NULL);
			
 
				 	mmu_notifier_invalidate_range_end(mm, start, end);
			
 
				 	tlb_finish_mmu(&tlb, start, end);
			
 
				 }
			
@@ -3471,12 +3464,10 @@ out:
 
				 
			
 
				 static int create_huge_pmd(struct vm_fault *vmf)
			
 
				 {
			
 
				-	struct vm_area_struct *vma = vmf->vma;
			
 
				-	if (vma_is_anonymous(vma))
			
 
				+	if (vma_is_anonymous(vmf->vma))
			
 
				 		return do_huge_pmd_anonymous_page(vmf);
			
 
				-	if (vma->vm_ops->pmd_fault)
			
 
				-		return vma->vm_ops->pmd_fault(vma, vmf->address, vmf->pmd,
			
 
				-				vmf->flags);
			
 
				+	if (vmf->vma->vm_ops->pmd_fault)
			
 
				+		return vmf->vma->vm_ops->pmd_fault(vmf);
			
 
				 	return VM_FAULT_FALLBACK;
			
 
				 }
			
 
				 
			
@@ -3485,8 +3476,7 @@ static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
 
				 	if (vma_is_anonymous(vmf->vma))
			
 
				 		return do_huge_pmd_wp_page(vmf, orig_pmd);
			
 
				 	if (vmf->vma->vm_ops->pmd_fault)
			
 
				-		return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf->address,
			
 
				-						   vmf->pmd, vmf->flags);
			
 
				+		return vmf->vma->vm_ops->pmd_fault(vmf);
			
 
				 
			
 
				 	/* COW handled on pte level: split pmd */
			
 
				 	VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
			
@@ -4155,6 +4145,38 @@ void copy_user_huge_page(struct page *dst, struct page *src,
 
				 		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
			
 
				 	}
			
 
				 }
			
 
				+
			
 
				+long copy_huge_page_from_user(struct page *dst_page,
			
 
				+				const void __user *usr_src,
			
 
				+				unsigned int pages_per_huge_page,
			
 
				+				bool allow_pagefault)
			
 
				+{
			
 
				+	void *src = (void *)usr_src;
			
 
				+	void *page_kaddr;
			
 
				+	unsigned long i, rc = 0;
			
 
				+	unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
			
 
				+
			
 
				+	for (i = 0; i < pages_per_huge_page; i++) {
			
 
				+		if (allow_pagefault)
			
 
				+			page_kaddr = kmap(dst_page + i);
			
 
				+		else
			
 
				+			page_kaddr = kmap_atomic(dst_page + i);
			
 
				+		rc = copy_from_user(page_kaddr,
			
 
				+				(const void __user *)(src + i * PAGE_SIZE),
			
 
				+				PAGE_SIZE);
			
 
				+		if (allow_pagefault)
			
 
				+			kunmap(dst_page + i);
			
 
				+		else
			
 
				+			kunmap_atomic(page_kaddr);
			
 
				+
			
 
				+		ret_val -= (PAGE_SIZE - rc);
			
 
				+		if (rc)
			
 
				+			break;
			
 
				+
			
 
				+		cond_resched();
			
 
				+	}
			
 
				+	return ret_val;
			
 
				+}
			
 
				 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
			
 
				 
			
 
				 #if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
			
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -179,7 +179,7 @@ static void release_memory_resource(struct resource *res)
 
				 void get_page_bootmem(unsigned long info,  struct page *page,
			
 
				 		      unsigned long type)
			
 
				 {
			
 
				-	page->lru.next = (struct list_head *) type;
			
 
				+	page->freelist = (void *)type;
			
 
				 	SetPagePrivate(page);
			
 
				 	set_page_private(page, info);
			
 
				 	page_ref_inc(page);
			
@@ -189,11 +189,12 @@ void put_page_bootmem(struct page *page)
 
				 {
			
 
				 	unsigned long type;
			
 
				 
			
 
				-	type = (unsigned long) page->lru.next;
			
 
				+	type = (unsigned long) page->freelist;
			
 
				 	BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
			
 
				 	       type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
			
 
				 
			
 
				 	if (page_ref_dec_return(page) == 1) {
			
 
				+		page->freelist = NULL;
			
 
				 		ClearPagePrivate(page);
			
 
				 		set_page_private(page, 0);
			
 
				 		INIT_LIST_HEAD(&page->lru);
			
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2806,11 +2806,11 @@ static inline void verify_mm_writelocked(struct mm_struct *mm)
 
				  *  anonymous maps.  eventually we may be able to do some
			
 
				  *  brk-specific accounting here.
			
 
				  */
			
 
				-static int do_brk(unsigned long addr, unsigned long request)
			
 
				+static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
			
 
				 {
			
 
				 	struct mm_struct *mm = current->mm;
			
 
				 	struct vm_area_struct *vma, *prev;
			
 
				-	unsigned long flags, len;
			
 
				+	unsigned long len;
			
 
				 	struct rb_node **rb_link, *rb_parent;
			
 
				 	pgoff_t pgoff = addr >> PAGE_SHIFT;
			
 
				 	int error;
			
@@ -2821,7 +2821,10 @@ static int do_brk(unsigned long addr, unsigned long request)
 
				 	if (!len)
			
 
				 		return 0;
			
 
				 
			
 
				-	flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
			
 
				+	/* Until we need other flags, refuse anything except VM_EXEC. */
			
 
				+	if ((flags & (~VM_EXEC)) != 0)
			
 
				+		return -EINVAL;
			
 
				+	flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
			
 
				 
			
 
				 	error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
			
 
				 	if (offset_in_page(error))
			
@@ -2889,7 +2892,12 @@ out:
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-int vm_brk(unsigned long addr, unsigned long len)
			
 
				+static int do_brk(unsigned long addr, unsigned long len)
			
 
				+{
			
 
				+	return do_brk_flags(addr, len, 0);
			
 
				+}
			
 
				+
			
 
				+int vm_brk_flags(unsigned long addr, unsigned long len, unsigned long flags)
			
 
				 {
			
 
				 	struct mm_struct *mm = current->mm;
			
 
				 	int ret;
			
@@ -2898,13 +2906,19 @@ int vm_brk(unsigned long addr, unsigned long len)
 
				 	if (down_write_killable(&mm->mmap_sem))
			
 
				 		return -EINTR;
			
 
				 
			
 
				-	ret = do_brk(addr, len);
			
 
				+	ret = do_brk_flags(addr, len, flags);
			
 
				 	populate = ((mm->def_flags & VM_LOCKED) != 0);
			
 
				 	up_write(&mm->mmap_sem);
			
 
				 	if (populate && !ret)
			
 
				 		mm_populate(addr, len);
			
 
				 	return ret;
			
 
				 }
			
 
				+EXPORT_SYMBOL(vm_brk_flags);
			
 
				+
			
 
				+int vm_brk(unsigned long addr, unsigned long len)
			
 
				+{
			
 
				+	return vm_brk_flags(addr, len, 0);
			
 
				+}
			
 
				 EXPORT_SYMBOL(vm_brk);
			
 
				 
			
 
				 /* Release all mmaps. */
			
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -60,7 +60,7 @@ struct zoneref *__next_zones_zonelist(struct zoneref *z,
 
				 	 * Find the next suitable zone to use for the allocation.
			
 
				 	 * Only filter based on nodemask if it's set
			
 
				 	 */
			
 
				-	if (likely(nodes == NULL))
			
 
				+	if (unlikely(nodes == NULL))
			
 
				 		while (zonelist_zone_idx(z) > highest_zoneidx)
			
 
				 			z++;
			
 
				 	else
			
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -33,34 +33,6 @@
 
				 
			
 
				 #include "internal.h"
			
 
				 
			
 
				-/*
			
 
				- * For a prot_numa update we only hold mmap_sem for read so there is a
			
 
				- * potential race with faulting where a pmd was temporarily none. This
			
 
				- * function checks for a transhuge pmd under the appropriate lock. It
			
 
				- * returns a pte if it was successfully locked or NULL if it raced with
			
 
				- * a transhuge insertion.
			
 
				- */
			
 
				-static pte_t *lock_pte_protection(struct vm_area_struct *vma, pmd_t *pmd,
			
 
				-			unsigned long addr, int prot_numa, spinlock_t **ptl)
			
 
				-{
			
 
				-	pte_t *pte;
			
 
				-	spinlock_t *pmdl;
			
 
				-
			
 
				-	/* !prot_numa is protected by mmap_sem held for write */
			
 
				-	if (!prot_numa)
			
 
				-		return pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);
			
 
				-
			
 
				-	pmdl = pmd_lock(vma->vm_mm, pmd);
			
 
				-	if (unlikely(pmd_trans_huge(*pmd) || pmd_none(*pmd))) {
			
 
				-		spin_unlock(pmdl);
			
 
				-		return NULL;
			
 
				-	}
			
 
				-
			
 
				-	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);
			
 
				-	spin_unlock(pmdl);
			
 
				-	return pte;
			
 
				-}
			
 
				-
			
 
				 static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
			
 
				 		unsigned long addr, unsigned long end, pgprot_t newprot,
			
 
				 		int dirty_accountable, int prot_numa)
			
@@ -71,7 +43,21 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 
				 	unsigned long pages = 0;
			
 
				 	int target_node = NUMA_NO_NODE;
			
 
				 
			
 
				-	pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl);
			
 
				+	/*
			
 
				+	 * Can be called with only the mmap_sem for reading by
			
 
				+	 * prot_numa so we must check the pmd isn't constantly
			
 
				+	 * changing from under us from pmd_none to pmd_trans_huge
			
 
				+	 * and/or the other way around.
			
 
				+	 */
			
 
				+	if (pmd_trans_unstable(pmd))
			
 
				+		return 0;
			
 
				+
			
 
				+	/*
			
 
				+	 * The pmd points to a regular pte so the pmd can't change
			
 
				+	 * from under us even if the mmap_sem is only hold for
			
 
				+	 * reading.
			
 
				+	 */
			
 
				+	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
			
 
				 	if (!pte)
			
 
				 		return 0;
			
 
				 
			
@@ -177,8 +163,6 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 
				 		if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
			
 
				 			if (next - addr != HPAGE_PMD_SIZE) {
			
 
				 				__split_huge_pmd(vma, pmd, addr, false, NULL);
			
 
				-				if (pmd_trans_unstable(pmd))
			
 
				-					continue;
			
 
				 			} else {
			
 
				 				int nr_ptes = change_huge_pmd(vma, pmd, addr,
			
 
				 						newprot, prot_numa);
			
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -22,6 +22,7 @@
 
				 #include <linux/mmu_notifier.h>
			
 
				 #include <linux/uaccess.h>
			
 
				 #include <linux/mm-arch-hooks.h>
			
 
				+#include <linux/userfaultfd_k.h>
			
 
				 
			
 
				 #include <asm/cacheflush.h>
			
 
				 #include <asm/tlbflush.h>
			
@@ -250,7 +251,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 
				 
			
 
				 static unsigned long move_vma(struct vm_area_struct *vma,
			
 
				 		unsigned long old_addr, unsigned long old_len,
			
 
				-		unsigned long new_len, unsigned long new_addr, bool *locked)
			
 
				+		unsigned long new_len, unsigned long new_addr,
			
 
				+		bool *locked, struct vm_userfaultfd_ctx *uf)
			
 
				 {
			
 
				 	struct mm_struct *mm = vma->vm_mm;
			
 
				 	struct vm_area_struct *new_vma;
			
@@ -309,6 +311,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 
				 		old_addr = new_addr;
			
 
				 		new_addr = err;
			
 
				 	} else {
			
 
				+		mremap_userfaultfd_prep(new_vma, uf);
			
 
				 		arch_remap(mm, old_addr, old_addr + old_len,
			
 
				 			   new_addr, new_addr + new_len);
			
 
				 	}
			
@@ -413,7 +416,8 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
 
				 }
			
 
				 
			
 
				 static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
			
 
				-		unsigned long new_addr, unsigned long new_len, bool *locked)
			
 
				+		unsigned long new_addr, unsigned long new_len, bool *locked,
			
 
				+		struct vm_userfaultfd_ctx *uf)
			
 
				 {
			
 
				 	struct mm_struct *mm = current->mm;
			
 
				 	struct vm_area_struct *vma;
			
@@ -458,7 +462,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 
				 	if (offset_in_page(ret))
			
 
				 		goto out1;
			
 
				 
			
 
				-	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked);
			
 
				+	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf);
			
 
				 	if (!(offset_in_page(ret)))
			
 
				 		goto out;
			
 
				 out1:
			
@@ -497,6 +501,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 
				 	unsigned long ret = -EINVAL;
			
 
				 	unsigned long charged = 0;
			
 
				 	bool locked = false;
			
 
				+	struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
			
 
				 
			
 
				 	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
			
 
				 		return ret;
			
@@ -523,7 +528,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 
				 
			
 
				 	if (flags & MREMAP_FIXED) {
			
 
				 		ret = mremap_to(addr, old_len, new_addr, new_len,
			
 
				-				&locked);
			
 
				+				&locked, &uf);
			
 
				 		goto out;
			
 
				 	}
			
 
				 
			
@@ -592,7 +597,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 
				 			goto out;
			
 
				 		}
			
 
				 
			
 
				-		ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked);
			
 
				+		ret = move_vma(vma, addr, old_len, new_len, new_addr,
			
 
				+			       &locked, &uf);
			
 
				 	}
			
 
				 out:
			
 
				 	if (offset_in_page(ret)) {
			
@@ -602,5 +608,6 @@ out:
 
				 	up_write(&current->mm->mmap_sem);
			
 
				 	if (locked && new_len > old_len)
			
 
				 		mm_populate(new_addr + old_len, new_len - old_len);
			
 
				+	mremap_userfaultfd_complete(&uf, addr, new_addr, old_len);
			
 
				 	return ret;
			
 
				 }
			
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1191,7 +1191,7 @@ error_free:
 
				 enomem:
			
 
				 	pr_err("Allocation of length %lu from process %d (%s) failed\n",
			
 
				 	       len, current->pid, current->comm);
			
 
				-	show_free_areas(0);
			
 
				+	show_free_areas(0, NULL);
			
 
				 	return -ENOMEM;
			
 
				 }
			
 
				 
			
@@ -1412,13 +1412,13 @@ error_getting_vma:
 
				 	kmem_cache_free(vm_region_jar, region);
			
 
				 	pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n",
			
 
				 			len, current->pid);
			
 
				-	show_free_areas(0);
			
 
				+	show_free_areas(0, NULL);
			
 
				 	return -ENOMEM;
			
 
				 
			
 
				 error_getting_region:
			
 
				 	pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n",
			
 
				 			len, current->pid);
			
 
				-	show_free_areas(0);
			
 
				+	show_free_areas(0, NULL);
			
 
				 	return -ENOMEM;
			
 
				 }
			
 
				 
			
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -417,7 +417,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p)
 
				 	if (oc->memcg)
			
 
				 		mem_cgroup_print_oom_info(oc->memcg, p);
			
 
				 	else
			
 
				-		show_mem(SHOW_MEM_FILTER_NODES);
			
 
				+		show_mem(SHOW_MEM_FILTER_NODES, nm);
			
 
				 	if (sysctl_oom_dump_tasks)
			
 
				 		dump_tasks(oc->memcg, oc->nodemask);
			
 
				 }
			
@@ -465,8 +465,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 
				 {
			
 
				 	struct mmu_gather tlb;
			
 
				 	struct vm_area_struct *vma;
			
 
				-	struct zap_details details = {.check_swap_entries = true,
			
 
				-				      .ignore_dirty = true};
			
 
				 	bool ret = true;
			
 
				 
			
 
				 	/*
			
@@ -510,14 +508,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 
				 
			
 
				 	tlb_gather_mmu(&tlb, mm, 0, -1);
			
 
				 	for (vma = mm->mmap ; vma; vma = vma->vm_next) {
			
 
				-		if (is_vm_hugetlb_page(vma))
			
 
				-			continue;
			
 
				-
			
 
				-		/*
			
 
				-		 * mlocked VMAs require explicit munlocking before unmap.
			
 
				-		 * Let's keep it simple here and skip such VMAs.
			
 
				-		 */
			
 
				-		if (vma->vm_flags & VM_LOCKED)
			
 
				+		if (!can_madv_dontneed_vma(vma))
			
 
				 			continue;
			
 
				 
			
 
				 		/*
			
@@ -532,7 +523,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 
				 		 */
			
 
				 		if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED))
			
 
				 			unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
			
 
				-					 &details);
			
 
				+					 NULL);
			
 
				 	}
			
 
				 	tlb_finish_mmu(&tlb, 0, -1);
			
 
				 	pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
			
@@ -1013,7 +1004,7 @@ bool out_of_memory(struct oom_control *oc)
 
				 	 * make sure exclude 0 mask - all other users should have at least
			
 
				 	 * ___GFP_DIRECT_RECLAIM to get here.
			
 
				 	 */
			
 
				-	if (oc->gfp_mask && !(oc->gfp_mask & (__GFP_FS|__GFP_NOFAIL)))
			
 
				+	if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS))
			
 
				 		return true;
			
 
				 
			
 
				 	/*
			
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -55,6 +55,7 @@
 
				 #include <linux/kmemleak.h>
			
 
				 #include <linux/compaction.h>
			
 
				 #include <trace/events/kmem.h>
			
 
				+#include <trace/events/oom.h>
			
 
				 #include <linux/prefetch.h>
			
 
				 #include <linux/mm_inline.h>
			
 
				 #include <linux/migrate.h>
			
@@ -714,7 +715,7 @@ static inline void rmv_page_order(struct page *page)
 
				 /*
			
 
				  * This function checks whether a page is free && is the buddy
			
 
				  * we can do coalesce a page and its buddy if
			
 
				- * (a) the buddy is not in a hole &&
			
 
				+ * (a) the buddy is not in a hole (check before calling!) &&
			
 
				  * (b) the buddy is in the buddy system &&
			
 
				  * (c) a page and its buddy have the same order &&
			
 
				  * (d) a page and its buddy are in the same zone.
			
@@ -729,9 +730,6 @@ static inline void rmv_page_order(struct page *page)
 
				 static inline int page_is_buddy(struct page *page, struct page *buddy,
			
 
				 							unsigned int order)
			
 
				 {
			
 
				-	if (!pfn_valid_within(page_to_pfn(buddy)))
			
 
				-		return 0;
			
 
				-
			
 
				 	if (page_is_guard(buddy) && page_order(buddy) == order) {
			
 
				 		if (page_zone_id(page) != page_zone_id(buddy))
			
 
				 			return 0;
			
@@ -787,9 +785,8 @@ static inline void __free_one_page(struct page *page,
 
				 		struct zone *zone, unsigned int order,
			
 
				 		int migratetype)
			
 
				 {
			
 
				-	unsigned long page_idx;
			
 
				-	unsigned long combined_idx;
			
 
				-	unsigned long uninitialized_var(buddy_idx);
			
 
				+	unsigned long combined_pfn;
			
 
				+	unsigned long uninitialized_var(buddy_pfn);
			
 
				 	struct page *buddy;
			
 
				 	unsigned int max_order;
			
 
				 
			
@@ -802,15 +799,16 @@ static inline void __free_one_page(struct page *page,
 
				 	if (likely(!is_migrate_isolate(migratetype)))
			
 
				 		__mod_zone_freepage_state(zone, 1 << order, migratetype);
			
 
				 
			
 
				-	page_idx = pfn & ((1 << MAX_ORDER) - 1);
			
 
				-
			
 
				-	VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
			
 
				+	VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
			
 
				 	VM_BUG_ON_PAGE(bad_range(zone, page), page);
			
 
				 
			
 
				 continue_merging:
			
 
				 	while (order < max_order - 1) {
			
 
				-		buddy_idx = __find_buddy_index(page_idx, order);
			
 
				-		buddy = page + (buddy_idx - page_idx);
			
 
				+		buddy_pfn = __find_buddy_pfn(pfn, order);
			
 
				+		buddy = page + (buddy_pfn - pfn);
			
 
				+
			
 
				+		if (!pfn_valid_within(buddy_pfn))
			
 
				+			goto done_merging;
			
 
				 		if (!page_is_buddy(page, buddy, order))
			
 
				 			goto done_merging;
			
 
				 		/*
			
@@ -824,9 +822,9 @@ continue_merging:
 
				 			zone->free_area[order].nr_free--;
			
 
				 			rmv_page_order(buddy);
			
 
				 		}
			
 
				-		combined_idx = buddy_idx & page_idx;
			
 
				-		page = page + (combined_idx - page_idx);
			
 
				-		page_idx = combined_idx;
			
 
				+		combined_pfn = buddy_pfn & pfn;
			
 
				+		page = page + (combined_pfn - pfn);
			
 
				+		pfn = combined_pfn;
			
 
				 		order++;
			
 
				 	}
			
 
				 	if (max_order < MAX_ORDER) {
			
@@ -841,8 +839,8 @@ continue_merging:
 
				 		if (unlikely(has_isolate_pageblock(zone))) {
			
 
				 			int buddy_mt;
			
 
				 
			
 
				-			buddy_idx = __find_buddy_index(page_idx, order);
			
 
				-			buddy = page + (buddy_idx - page_idx);
			
 
				+			buddy_pfn = __find_buddy_pfn(pfn, order);
			
 
				+			buddy = page + (buddy_pfn - pfn);
			
 
				 			buddy_mt = get_pageblock_migratetype(buddy);
			
 
				 
			
 
				 			if (migratetype != buddy_mt
			
@@ -865,12 +863,12 @@ done_merging:
 
				 	 * so it's less likely to be used soon and more likely to be merged
			
 
				 	 * as a higher order page
			
 
				 	 */
			
 
				-	if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
			
 
				+	if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) {
			
 
				 		struct page *higher_page, *higher_buddy;
			
 
				-		combined_idx = buddy_idx & page_idx;
			
 
				-		higher_page = page + (combined_idx - page_idx);
			
 
				-		buddy_idx = __find_buddy_index(combined_idx, order + 1);
			
 
				-		higher_buddy = higher_page + (buddy_idx - combined_idx);
			
 
				+		combined_pfn = buddy_pfn & pfn;
			
 
				+		higher_page = page + (combined_pfn - pfn);
			
 
				+		buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
			
 
				+		higher_buddy = higher_page + (buddy_pfn - combined_pfn);
			
 
				 		if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
			
 
				 			list_add_tail(&page->lru,
			
 
				 				&zone->free_area[order].free_list[migratetype]);
			
@@ -3007,18 +3005,12 @@ static inline bool should_suppress_show_mem(void)
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-static DEFINE_RATELIMIT_STATE(nopage_rs,
			
 
				-		DEFAULT_RATELIMIT_INTERVAL,
			
 
				-		DEFAULT_RATELIMIT_BURST);
			
 
				-
			
 
				-void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
			
 
				+static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
			
 
				 {
			
 
				 	unsigned int filter = SHOW_MEM_FILTER_NODES;
			
 
				-	struct va_format vaf;
			
 
				-	va_list args;
			
 
				+	static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1);
			
 
				 
			
 
				-	if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
			
 
				-	    debug_guardpage_minorder() > 0)
			
 
				+	if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs))
			
 
				 		return;
			
 
				 
			
 
				 	/*
			
@@ -3033,6 +3025,20 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
 
				 	if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
			
 
				 		filter &= ~SHOW_MEM_FILTER_NODES;
			
 
				 
			
 
				+	show_mem(filter, nodemask);
			
 
				+}
			
 
				+
			
 
				+void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
			
 
				+{
			
 
				+	struct va_format vaf;
			
 
				+	va_list args;
			
 
				+	static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL,
			
 
				+				      DEFAULT_RATELIMIT_BURST);
			
 
				+
			
 
				+	if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
			
 
				+	    debug_guardpage_minorder() > 0)
			
 
				+		return;
			
 
				+
			
 
				 	pr_warn("%s: ", current->comm);
			
 
				 
			
 
				 	va_start(args, fmt);
			
@@ -3041,11 +3047,36 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
 
				 	pr_cont("%pV", &vaf);
			
 
				 	va_end(args);
			
 
				 
			
 
				-	pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask);
			
 
				+	pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask);
			
 
				+	if (nodemask)
			
 
				+		pr_cont("%*pbl\n", nodemask_pr_args(nodemask));
			
 
				+	else
			
 
				+		pr_cont("(null)\n");
			
 
				+
			
 
				+	cpuset_print_current_mems_allowed();
			
 
				 
			
 
				 	dump_stack();
			
 
				-	if (!should_suppress_show_mem())
			
 
				-		show_mem(filter);
			
 
				+	warn_alloc_show_mem(gfp_mask, nodemask);
			
 
				+}
			
 
				+
			
 
				+static inline struct page *
			
 
				+__alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
			
 
				+			      unsigned int alloc_flags,
			
 
				+			      const struct alloc_context *ac)
			
 
				+{
			
 
				+	struct page *page;
			
 
				+
			
 
				+	page = get_page_from_freelist(gfp_mask, order,
			
 
				+			alloc_flags|ALLOC_CPUSET, ac);
			
 
				+	/*
			
 
				+	 * fallback to ignore cpuset restriction if our nodes
			
 
				+	 * are depleted
			
 
				+	 */
			
 
				+	if (!page)
			
 
				+		page = get_page_from_freelist(gfp_mask, order,
			
 
				+				alloc_flags, ac);
			
 
				+
			
 
				+	return page;
			
 
				 }
			
 
				 
			
 
				 static inline struct page *
			
@@ -3083,47 +3114,42 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 
				 	if (page)
			
 
				 		goto out;
			
 
				 
			
 
				-	if (!(gfp_mask & __GFP_NOFAIL)) {
			
 
				-		/* Coredumps can quickly deplete all memory reserves */
			
 
				-		if (current->flags & PF_DUMPCORE)
			
 
				-			goto out;
			
 
				-		/* The OOM killer will not help higher order allocs */
			
 
				-		if (order > PAGE_ALLOC_COSTLY_ORDER)
			
 
				-			goto out;
			
 
				-		/* The OOM killer does not needlessly kill tasks for lowmem */
			
 
				-		if (ac->high_zoneidx < ZONE_NORMAL)
			
 
				-			goto out;
			
 
				-		if (pm_suspended_storage())
			
 
				-			goto out;
			
 
				-		/*
			
 
				-		 * XXX: GFP_NOFS allocations should rather fail than rely on
			
 
				-		 * other request to make a forward progress.
			
 
				-		 * We are in an unfortunate situation where out_of_memory cannot
			
 
				-		 * do much for this context but let's try it to at least get
			
 
				-		 * access to memory reserved if the current task is killed (see
			
 
				-		 * out_of_memory). Once filesystems are ready to handle allocation
			
 
				-		 * failures more gracefully we should just bail out here.
			
 
				-		 */
			
 
				+	/* Coredumps can quickly deplete all memory reserves */
			
 
				+	if (current->flags & PF_DUMPCORE)
			
 
				+		goto out;
			
 
				+	/* The OOM killer will not help higher order allocs */
			
 
				+	if (order > PAGE_ALLOC_COSTLY_ORDER)
			
 
				+		goto out;
			
 
				+	/* The OOM killer does not needlessly kill tasks for lowmem */
			
 
				+	if (ac->high_zoneidx < ZONE_NORMAL)
			
 
				+		goto out;
			
 
				+	if (pm_suspended_storage())
			
 
				+		goto out;
			
 
				+	/*
			
 
				+	 * XXX: GFP_NOFS allocations should rather fail than rely on
			
 
				+	 * other request to make a forward progress.
			
 
				+	 * We are in an unfortunate situation where out_of_memory cannot
			
 
				+	 * do much for this context but let's try it to at least get
			
 
				+	 * access to memory reserved if the current task is killed (see
			
 
				+	 * out_of_memory). Once filesystems are ready to handle allocation
			
 
				+	 * failures more gracefully we should just bail out here.
			
 
				+	 */
			
 
				+
			
 
				+	/* The OOM killer may not free memory on a specific node */
			
 
				+	if (gfp_mask & __GFP_THISNODE)
			
 
				+		goto out;
			
 
				 
			
 
				-		/* The OOM killer may not free memory on a specific node */
			
 
				-		if (gfp_mask & __GFP_THISNODE)
			
 
				-			goto out;
			
 
				-	}
			
 
				 	/* Exhausted what can be done so it's blamo time */
			
 
				 	if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
			
 
				 		*did_some_progress = 1;
			
 
				 
			
 
				-		if (gfp_mask & __GFP_NOFAIL) {
			
 
				-			page = get_page_from_freelist(gfp_mask, order,
			
 
				-					ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac);
			
 
				-			/*
			
 
				-			 * fallback to ignore cpuset restriction if our nodes
			
 
				-			 * are depleted
			
 
				-			 */
			
 
				-			if (!page)
			
 
				-				page = get_page_from_freelist(gfp_mask, order,
			
 
				+		/*
			
 
				+		 * Help non-failing allocations by giving them access to memory
			
 
				+		 * reserves
			
 
				+		 */
			
 
				+		if (gfp_mask & __GFP_NOFAIL)
			
 
				+			page = __alloc_pages_cpuset_fallback(gfp_mask, order,
			
 
				 					ALLOC_NO_WATERMARKS, ac);
			
 
				-		}
			
 
				 	}
			
 
				 out:
			
 
				 	mutex_unlock(&oom_lock);
			
@@ -3192,6 +3218,9 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
 
				 {
			
 
				 	int max_retries = MAX_COMPACT_RETRIES;
			
 
				 	int min_priority;
			
 
				+	bool ret = false;
			
 
				+	int retries = *compaction_retries;
			
 
				+	enum compact_priority priority = *compact_priority;
			
 
				 
			
 
				 	if (!order)
			
 
				 		return false;
			
@@ -3213,8 +3242,10 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
 
				 	 * But do not retry if the given zonelist is not suitable for
			
 
				 	 * compaction.
			
 
				 	 */
			
 
				-	if (compaction_withdrawn(compact_result))
			
 
				-		return compaction_zonelist_suitable(ac, order, alloc_flags);
			
 
				+	if (compaction_withdrawn(compact_result)) {
			
 
				+		ret = compaction_zonelist_suitable(ac, order, alloc_flags);
			
 
				+		goto out;
			
 
				+	}
			
 
				 
			
 
				 	/*
			
 
				 	 * !costly requests are much more important than __GFP_REPEAT
			
@@ -3226,8 +3257,10 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
 
				 	 */
			
 
				 	if (order > PAGE_ALLOC_COSTLY_ORDER)
			
 
				 		max_retries /= 4;
			
 
				-	if (*compaction_retries <= max_retries)
			
 
				-		return true;
			
 
				+	if (*compaction_retries <= max_retries) {
			
 
				+		ret = true;
			
 
				+		goto out;
			
 
				+	}
			
 
				 
			
 
				 	/*
			
 
				 	 * Make sure there are attempts at the highest priority if we exhausted
			
@@ -3236,12 +3269,15 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
 
				 check_priority:
			
 
				 	min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
			
 
				 			MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
			
 
				+
			
 
				 	if (*compact_priority > min_priority) {
			
 
				 		(*compact_priority)--;
			
 
				 		*compaction_retries = 0;
			
 
				-		return true;
			
 
				+		ret = true;
			
 
				 	}
			
 
				-	return false;
			
 
				+out:
			
 
				+	trace_compact_retry(order, priority, compact_result, retries, max_retries, ret);
			
 
				+	return ret;
			
 
				 }
			
 
				 #else
			
 
				 static inline struct page *
			
@@ -3464,6 +3500,8 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
 
				 					ac->nodemask) {
			
 
				 		unsigned long available;
			
 
				 		unsigned long reclaimable;
			
 
				+		unsigned long min_wmark = min_wmark_pages(zone);
			
 
				+		bool wmark;
			
 
				 
			
 
				 		available = reclaimable = zone_reclaimable_pages(zone);
			
 
				 		available -= DIV_ROUND_UP((*no_progress_loops) * available,
			
@@ -3474,8 +3512,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
 
				 		 * Would the allocation succeed if we reclaimed the whole
			
 
				 		 * available?
			
 
				 		 */
			
 
				-		if (__zone_watermark_ok(zone, order, min_wmark_pages(zone),
			
 
				-				ac_classzone_idx(ac), alloc_flags, available)) {
			
 
				+		wmark = __zone_watermark_ok(zone, order, min_wmark,
			
 
				+				ac_classzone_idx(ac), alloc_flags, available);
			
 
				+		trace_reclaim_retry_zone(z, order, reclaimable,
			
 
				+				available, min_wmark, *no_progress_loops, wmark);
			
 
				+		if (wmark) {
			
 
				 			/*
			
 
				 			 * If we didn't make any progress and have a lot of
			
 
				 			 * dirty + writeback pages then we should wait for
			
@@ -3555,6 +3596,14 @@ retry_cpuset:
 
				 	no_progress_loops = 0;
			
 
				 	compact_priority = DEF_COMPACT_PRIORITY;
			
 
				 	cpuset_mems_cookie = read_mems_allowed_begin();
			
 
				+
			
 
				+	/*
			
 
				+	 * The fast path uses conservative alloc_flags to succeed only until
			
 
				+	 * kswapd needs to be woken up, and to avoid the cost of setting up
			
 
				+	 * alloc_flags precisely. So we do that now.
			
 
				+	 */
			
 
				+	alloc_flags = gfp_to_alloc_flags(gfp_mask);
			
 
				+
			
 
				 	/*
			
 
				 	 * We need to recalculate the starting point for the zonelist iterator
			
 
				 	 * because we might have used different nodemask in the fast path, or
			
@@ -3566,14 +3615,6 @@ retry_cpuset:
 
				 	if (!ac->preferred_zoneref->zone)
			
 
				 		goto nopage;
			
 
				 
			
 
				-
			
 
				-	/*
			
 
				-	 * The fast path uses conservative alloc_flags to succeed only until
			
 
				-	 * kswapd needs to be woken up, and to avoid the cost of setting up
			
 
				-	 * alloc_flags precisely. So we do that now.
			
 
				-	 */
			
 
				-	alloc_flags = gfp_to_alloc_flags(gfp_mask);
			
 
				-
			
 
				 	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
			
 
				 		wake_all_kswapds(order, ac);
			
 
				 
			
@@ -3650,35 +3691,21 @@ retry:
 
				 		goto got_pg;
			
 
				 
			
 
				 	/* Caller is not willing to reclaim, we can't balance anything */
			
 
				-	if (!can_direct_reclaim) {
			
 
				-		/*
			
 
				-		 * All existing users of the __GFP_NOFAIL are blockable, so warn
			
 
				-		 * of any new users that actually allow this type of allocation
			
 
				-		 * to fail.
			
 
				-		 */
			
 
				-		WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
			
 
				+	if (!can_direct_reclaim)
			
 
				 		goto nopage;
			
 
				-	}
			
 
				 
			
 
				-	/* Avoid recursion of direct reclaim */
			
 
				-	if (current->flags & PF_MEMALLOC) {
			
 
				-		/*
			
 
				-		 * __GFP_NOFAIL request from this context is rather bizarre
			
 
				-		 * because we cannot reclaim anything and only can loop waiting
			
 
				-		 * for somebody to do a work for us.
			
 
				-		 */
			
 
				-		if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
			
 
				-			cond_resched();
			
 
				-			goto retry;
			
 
				-		}
			
 
				-		goto nopage;
			
 
				+	/* Make sure we know about allocations which stall for too long */
			
 
				+	if (time_after(jiffies, alloc_start + stall_timeout)) {
			
 
				+		warn_alloc(gfp_mask, ac->nodemask,
			
 
				+			"page allocation stalls for %ums, order:%u",
			
 
				+			jiffies_to_msecs(jiffies-alloc_start), order);
			
 
				+		stall_timeout += 10 * HZ;
			
 
				 	}
			
 
				 
			
 
				-	/* Avoid allocations with no watermarks from looping endlessly */
			
 
				-	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
			
 
				+	/* Avoid recursion of direct reclaim */
			
 
				+	if (current->flags & PF_MEMALLOC)
			
 
				 		goto nopage;
			
 
				 
			
 
				-
			
 
				 	/* Try direct reclaim and then allocating */
			
 
				 	page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
			
 
				 							&did_some_progress);
			
@@ -3702,14 +3729,6 @@ retry:
 
				 	if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
			
 
				 		goto nopage;
			
 
				 
			
 
				-	/* Make sure we know about allocations which stall for too long */
			
 
				-	if (time_after(jiffies, alloc_start + stall_timeout)) {
			
 
				-		warn_alloc(gfp_mask,
			
 
				-			"page allocation stalls for %ums, order:%u",
			
 
				-			jiffies_to_msecs(jiffies-alloc_start), order);
			
 
				-		stall_timeout += 10 * HZ;
			
 
				-	}
			
 
				-
			
 
				 	if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
			
 
				 				 did_some_progress > 0, &no_progress_loops))
			
 
				 		goto retry;
			
@@ -3738,6 +3757,10 @@ retry:
 
				 	if (page)
			
 
				 		goto got_pg;
			
 
				 
			
 
				+	/* Avoid allocations with no watermarks from looping endlessly */
			
 
				+	if (test_thread_flag(TIF_MEMDIE))
			
 
				+		goto nopage;
			
 
				+
			
 
				 	/* Retry as long as the OOM killer is making progress */
			
 
				 	if (did_some_progress) {
			
 
				 		no_progress_loops = 0;
			
@@ -3755,7 +3778,48 @@ nopage:
 
				 	if (read_mems_allowed_retry(cpuset_mems_cookie))
			
 
				 		goto retry_cpuset;
			
 
				 
			
 
				-	warn_alloc(gfp_mask,
			
 
				+	/*
			
 
				+	 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
			
 
				+	 * we always retry
			
 
				+	 */
			
 
				+	if (gfp_mask & __GFP_NOFAIL) {
			
 
				+		/*
			
 
				+		 * All existing users of the __GFP_NOFAIL are blockable, so warn
			
 
				+		 * of any new users that actually require GFP_NOWAIT
			
 
				+		 */
			
 
				+		if (WARN_ON_ONCE(!can_direct_reclaim))
			
 
				+			goto fail;
			
 
				+
			
 
				+		/*
			
 
				+		 * PF_MEMALLOC request from this context is rather bizarre
			
 
				+		 * because we cannot reclaim anything and only can loop waiting
			
 
				+		 * for somebody to do a work for us
			
 
				+		 */
			
 
				+		WARN_ON_ONCE(current->flags & PF_MEMALLOC);
			
 
				+
			
 
				+		/*
			
 
				+		 * non failing costly orders are a hard requirement which we
			
 
				+		 * are not prepared for much so let's warn about these users
			
 
				+		 * so that we can identify them and convert them to something
			
 
				+		 * else.
			
 
				+		 */
			
 
				+		WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
			
 
				+
			
 
				+		/*
			
 
				+		 * Help non-failing allocations by giving them access to memory
			
 
				+		 * reserves but do not use ALLOC_NO_WATERMARKS because this
			
 
				+		 * could deplete whole memory reserves which would just make
			
 
				+		 * the situation worse
			
 
				+		 */
			
 
				+		page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
			
 
				+		if (page)
			
 
				+			goto got_pg;
			
 
				+
			
 
				+		cond_resched();
			
 
				+		goto retry;
			
 
				+	}
			
 
				+fail:
			
 
				+	warn_alloc(gfp_mask, ac->nodemask,
			
 
				 			"page allocation failure: order:%u", order);
			
 
				 got_pg:
			
 
				 	return page;
			
@@ -4252,20 +4316,20 @@ void si_meminfo_node(struct sysinfo *val, int nid)
 
				  * Determine whether the node should be displayed or not, depending on whether
			
 
				  * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
			
 
				  */
			
 
				-bool skip_free_areas_node(unsigned int flags, int nid)
			
 
				+static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask)
			
 
				 {
			
 
				-	bool ret = false;
			
 
				-	unsigned int cpuset_mems_cookie;
			
 
				-
			
 
				 	if (!(flags & SHOW_MEM_FILTER_NODES))
			
 
				-		goto out;
			
 
				+		return false;
			
 
				 
			
 
				-	do {
			
 
				-		cpuset_mems_cookie = read_mems_allowed_begin();
			
 
				-		ret = !node_isset(nid, cpuset_current_mems_allowed);
			
 
				-	} while (read_mems_allowed_retry(cpuset_mems_cookie));
			
 
				-out:
			
 
				-	return ret;
			
 
				+	/*
			
 
				+	 * no node mask - aka implicit memory numa policy. Do not bother with
			
 
				+	 * the synchronization - read_mems_allowed_begin - because we do not
			
 
				+	 * have to be precise here.
			
 
				+	 */
			
 
				+	if (!nodemask)
			
 
				+		nodemask = &cpuset_current_mems_allowed;
			
 
				+
			
 
				+	return !node_isset(nid, *nodemask);
			
 
				 }
			
 
				 
			
 
				 #define K(x) ((x) << (PAGE_SHIFT-10))
			
@@ -4306,7 +4370,7 @@ static void show_migration_types(unsigned char type)
 
				  * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
			
 
				  *   cpuset.
			
 
				  */
			
 
				-void show_free_areas(unsigned int filter)
			
 
				+void show_free_areas(unsigned int filter, nodemask_t *nodemask)
			
 
				 {
			
 
				 	unsigned long free_pcp = 0;
			
 
				 	int cpu;
			
@@ -4314,7 +4378,7 @@ void show_free_areas(unsigned int filter)
 
				 	pg_data_t *pgdat;
			
 
				 
			
 
				 	for_each_populated_zone(zone) {
			
 
				-		if (skip_free_areas_node(filter, zone_to_nid(zone)))
			
 
				+		if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
			
 
				 			continue;
			
 
				 
			
 
				 		for_each_online_cpu(cpu)
			
@@ -4348,6 +4412,9 @@ void show_free_areas(unsigned int filter)
 
				 		global_page_state(NR_FREE_CMA_PAGES));
			
 
				 
			
 
				 	for_each_online_pgdat(pgdat) {
			
 
				+		if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
			
 
				+			continue;
			
 
				+
			
 
				 		printk("Node %d"
			
 
				 			" active_anon:%lukB"
			
 
				 			" inactive_anon:%lukB"
			
@@ -4397,7 +4464,7 @@ void show_free_areas(unsigned int filter)
 
				 	for_each_populated_zone(zone) {
			
 
				 		int i;
			
 
				 
			
 
				-		if (skip_free_areas_node(filter, zone_to_nid(zone)))
			
 
				+		if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
			
 
				 			continue;
			
 
				 
			
 
				 		free_pcp = 0;
			
@@ -4462,7 +4529,7 @@ void show_free_areas(unsigned int filter)
 
				 		unsigned long nr[MAX_ORDER], flags, total = 0;
			
 
				 		unsigned char types[MAX_ORDER];
			
 
				 
			
 
				-		if (skip_free_areas_node(filter, zone_to_nid(zone)))
			
 
				+		if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
			
 
				 			continue;
			
 
				 		show_node(zone);
			
 
				 		printk(KERN_CONT "%s: ", zone->name);
			
@@ -5083,8 +5150,17 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 
				 		if (context != MEMMAP_EARLY)
			
 
				 			goto not_early;
			
 
				 
			
 
				-		if (!early_pfn_valid(pfn))
			
 
				+		if (!early_pfn_valid(pfn)) {
			
 
				+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
			
 
				+			/*
			
 
				+			 * Skip to the pfn preceding the next valid one (or
			
 
				+			 * end_pfn), such that we hit a valid pfn (or end_pfn)
			
 
				+			 * on our next iteration of the loop.
			
 
				+			 */
			
 
				+			pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1;
			
 
				+#endif
			
 
				 			continue;
			
 
				+		}
			
 
				 		if (!early_pfn_in_nid(pfn, nid))
			
 
				 			continue;
			
 
				 		if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
			
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -83,7 +83,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
 
				 	unsigned long flags, nr_pages;
			
 
				 	bool isolated_page = false;
			
 
				 	unsigned int order;
			
 
				-	unsigned long page_idx, buddy_idx;
			
 
				+	unsigned long pfn, buddy_pfn;
			
 
				 	struct page *buddy;
			
 
				 
			
 
				 	zone = page_zone(page);
			
@@ -102,11 +102,11 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
 
				 	if (PageBuddy(page)) {
			
 
				 		order = page_order(page);
			
 
				 		if (order >= pageblock_order) {
			
 
				-			page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
			
 
				-			buddy_idx = __find_buddy_index(page_idx, order);
			
 
				-			buddy = page + (buddy_idx - page_idx);
			
 
				+			pfn = page_to_pfn(page);
			
 
				+			buddy_pfn = __find_buddy_pfn(pfn, order);
			
 
				+			buddy = page + (buddy_pfn - pfn);
			
 
				 
			
 
				-			if (pfn_valid_within(page_to_pfn(buddy)) &&
			
 
				+			if (pfn_valid_within(buddy_pfn) &&
			
 
				 			    !is_migrate_isolate_page(buddy)) {
			
 
				 				__isolate_free_page(page, order);
			
 
				 				isolated_page = true;
			
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -34,6 +34,8 @@
 
				 #include <linux/uio.h>
			
 
				 #include <linux/khugepaged.h>
			
 
				 
			
 
				+#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
			
 
				+
			
 
				 static struct vfsmount *shm_mnt;
			
 
				 
			
 
				 #ifdef CONFIG_SHMEM
			
@@ -70,6 +72,8 @@ static struct vfsmount *shm_mnt;
 
				 #include <linux/syscalls.h>
			
 
				 #include <linux/fcntl.h>
			
 
				 #include <uapi/linux/memfd.h>
			
 
				+#include <linux/userfaultfd_k.h>
			
 
				+#include <linux/rmap.h>
			
 
				 
			
 
				 #include <linux/uaccess.h>
			
 
				 #include <asm/pgtable.h>
			
@@ -115,13 +119,14 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 
				 				struct shmem_inode_info *info, pgoff_t index);
			
 
				 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
			
 
				 		struct page **pagep, enum sgp_type sgp,
			
 
				-		gfp_t gfp, struct mm_struct *fault_mm, int *fault_type);
			
 
				+		gfp_t gfp, struct vm_area_struct *vma,
			
 
				+		struct vm_fault *vmf, int *fault_type);
			
 
				 
			
 
				 int shmem_getpage(struct inode *inode, pgoff_t index,
			
 
				 		struct page **pagep, enum sgp_type sgp)
			
 
				 {
			
 
				 	return shmem_getpage_gfp(inode, index, pagep, sgp,
			
 
				-		mapping_gfp_mask(inode->i_mapping), NULL, NULL);
			
 
				+		mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
			
 
				 }
			
 
				 
			
 
				 static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
			
@@ -190,6 +195,11 @@ static const struct inode_operations shmem_special_inode_operations;
 
				 static const struct vm_operations_struct shmem_vm_ops;
			
 
				 static struct file_system_type shmem_fs_type;
			
 
				 
			
 
				+bool vma_is_shmem(struct vm_area_struct *vma)
			
 
				+{
			
 
				+	return vma->vm_ops == &shmem_vm_ops;
			
 
				+}
			
 
				+
			
 
				 static LIST_HEAD(shmem_swaplist);
			
 
				 static DEFINE_MUTEX(shmem_swaplist_mutex);
			
 
				 
			
@@ -1570,7 +1580,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 
				  */
			
 
				 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
			
 
				 	struct page **pagep, enum sgp_type sgp, gfp_t gfp,
			
 
				-	struct mm_struct *fault_mm, int *fault_type)
			
 
				+	struct vm_area_struct *vma, struct vm_fault *vmf, int *fault_type)
			
 
				 {
			
 
				 	struct address_space *mapping = inode->i_mapping;
			
 
				 	struct shmem_inode_info *info = SHMEM_I(inode);
			
@@ -1624,7 +1634,7 @@ repeat:
 
				 	 * bring it back from swap or allocate.
			
 
				 	 */
			
 
				 	sbinfo = SHMEM_SB(inode->i_sb);
			
 
				-	charge_mm = fault_mm ? : current->mm;
			
 
				+	charge_mm = vma ? vma->vm_mm : current->mm;
			
 
				 
			
 
				 	if (swap.val) {
			
 
				 		/* Look it up and read it in.. */
			
@@ -1634,7 +1644,8 @@ repeat:
 
				 			if (fault_type) {
			
 
				 				*fault_type |= VM_FAULT_MAJOR;
			
 
				 				count_vm_event(PGMAJFAULT);
			
 
				-				mem_cgroup_count_vm_event(fault_mm, PGMAJFAULT);
			
 
				+				mem_cgroup_count_vm_event(charge_mm,
			
 
				+							  PGMAJFAULT);
			
 
				 			}
			
 
				 			/* Here we actually start the io */
			
 
				 			page = shmem_swapin(swap, gfp, info, index);
			
@@ -1703,6 +1714,11 @@ repeat:
 
				 		swap_free(swap);
			
 
				 
			
 
				 	} else {
			
 
				+		if (vma && userfaultfd_missing(vma)) {
			
 
				+			*fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
			
 
				+			return 0;
			
 
				+		}
			
 
				+
			
 
				 		/* shmem_symlink() */
			
 
				 		if (mapping->a_ops != &shmem_aops)
			
 
				 			goto alloc_nohuge;
			
@@ -1965,7 +1981,7 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 
				 		sgp = SGP_NOHUGE;
			
 
				 
			
 
				 	error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp,
			
 
				-				  gfp, vma->vm_mm, &ret);
			
 
				+				  gfp, vma, vmf, &ret);
			
 
				 	if (error)
			
 
				 		return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
			
 
				 	return ret;
			
@@ -2175,10 +2191,123 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
 
				 
			
 
				 bool shmem_mapping(struct address_space *mapping)
			
 
				 {
			
 
				-	if (!mapping->host)
			
 
				-		return false;
			
 
				+	return mapping->a_ops == &shmem_aops;
			
 
				+}
			
 
				 
			
 
				-	return mapping->host->i_sb->s_op == &shmem_ops;
			
 
				+int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
			
 
				+			   pmd_t *dst_pmd,
			
 
				+			   struct vm_area_struct *dst_vma,
			
 
				+			   unsigned long dst_addr,
			
 
				+			   unsigned long src_addr,
			
 
				+			   struct page **pagep)
			
 
				+{
			
 
				+	struct inode *inode = file_inode(dst_vma->vm_file);
			
 
				+	struct shmem_inode_info *info = SHMEM_I(inode);
			
 
				+	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
			
 
				+	struct address_space *mapping = inode->i_mapping;
			
 
				+	gfp_t gfp = mapping_gfp_mask(mapping);
			
 
				+	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
			
 
				+	struct mem_cgroup *memcg;
			
 
				+	spinlock_t *ptl;
			
 
				+	void *page_kaddr;
			
 
				+	struct page *page;
			
 
				+	pte_t _dst_pte, *dst_pte;
			
 
				+	int ret;
			
 
				+
			
 
				+	ret = -ENOMEM;
			
 
				+	if (shmem_acct_block(info->flags, 1))
			
 
				+		goto out;
			
 
				+	if (sbinfo->max_blocks) {
			
 
				+		if (percpu_counter_compare(&sbinfo->used_blocks,
			
 
				+					   sbinfo->max_blocks) >= 0)
			
 
				+			goto out_unacct_blocks;
			
 
				+		percpu_counter_inc(&sbinfo->used_blocks);
			
 
				+	}
			
 
				+
			
 
				+	if (!*pagep) {
			
 
				+		page = shmem_alloc_page(gfp, info, pgoff);
			
 
				+		if (!page)
			
 
				+			goto out_dec_used_blocks;
			
 
				+
			
 
				+		page_kaddr = kmap_atomic(page);
			
 
				+		ret = copy_from_user(page_kaddr, (const void __user *)src_addr,
			
 
				+				     PAGE_SIZE);
			
 
				+		kunmap_atomic(page_kaddr);
			
 
				+
			
 
				+		/* fallback to copy_from_user outside mmap_sem */
			
 
				+		if (unlikely(ret)) {
			
 
				+			*pagep = page;
			
 
				+			if (sbinfo->max_blocks)
			
 
				+				percpu_counter_add(&sbinfo->used_blocks, -1);
			
 
				+			shmem_unacct_blocks(info->flags, 1);
			
 
				+			/* don't free the page */
			
 
				+			return -EFAULT;
			
 
				+		}
			
 
				+	} else {
			
 
				+		page = *pagep;
			
 
				+		*pagep = NULL;
			
 
				+	}
			
 
				+
			
 
				+	VM_BUG_ON(PageLocked(page) || PageSwapBacked(page));
			
 
				+	__SetPageLocked(page);
			
 
				+	__SetPageSwapBacked(page);
			
 
				+	__SetPageUptodate(page);
			
 
				+
			
 
				+	ret = mem_cgroup_try_charge(page, dst_mm, gfp, &memcg, false);
			
 
				+	if (ret)
			
 
				+		goto out_release;
			
 
				+
			
 
				+	ret = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
			
 
				+	if (!ret) {
			
 
				+		ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL);
			
 
				+		radix_tree_preload_end();
			
 
				+	}
			
 
				+	if (ret)
			
 
				+		goto out_release_uncharge;
			
 
				+
			
 
				+	mem_cgroup_commit_charge(page, memcg, false, false);
			
 
				+
			
 
				+	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
			
 
				+	if (dst_vma->vm_flags & VM_WRITE)
			
 
				+		_dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
			
 
				+
			
 
				+	ret = -EEXIST;
			
 
				+	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
			
 
				+	if (!pte_none(*dst_pte))
			
 
				+		goto out_release_uncharge_unlock;
			
 
				+
			
 
				+	lru_cache_add_anon(page);
			
 
				+
			
 
				+	spin_lock(&info->lock);
			
 
				+	info->alloced++;
			
 
				+	inode->i_blocks += BLOCKS_PER_PAGE;
			
 
				+	shmem_recalc_inode(inode);
			
 
				+	spin_unlock(&info->lock);
			
 
				+
			
 
				+	inc_mm_counter(dst_mm, mm_counter_file(page));
			
 
				+	page_add_file_rmap(page, false);
			
 
				+	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
			
 
				+
			
 
				+	/* No need to invalidate - it was non-present before */
			
 
				+	update_mmu_cache(dst_vma, dst_addr, dst_pte);
			
 
				+	unlock_page(page);
			
 
				+	pte_unmap_unlock(dst_pte, ptl);
			
 
				+	ret = 0;
			
 
				+out:
			
 
				+	return ret;
			
 
				+out_release_uncharge_unlock:
			
 
				+	pte_unmap_unlock(dst_pte, ptl);
			
 
				+out_release_uncharge:
			
 
				+	mem_cgroup_cancel_charge(page, memcg, false);
			
 
				+out_release:
			
 
				+	unlock_page(page);
			
 
				+	put_page(page);
			
 
				+out_dec_used_blocks:
			
 
				+	if (sbinfo->max_blocks)
			
 
				+		percpu_counter_add(&sbinfo->used_blocks, -1);
			
 
				+out_unacct_blocks:
			
 
				+	shmem_unacct_blocks(info->flags, 1);
			
 
				+	goto out;
			
 
				 }
			
 
				 
			
 
				 #ifdef CONFIG_TMPFS
			
@@ -4140,7 +4269,7 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
 
				 
			
 
				 	BUG_ON(mapping->a_ops != &shmem_aops);
			
 
				 	error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE,
			
 
				-				  gfp, NULL, NULL);
			
 
				+				  gfp, NULL, NULL, NULL);
			
 
				 	if (error)
			
 
				 		page = ERR_PTR(error);
			
 
				 	else
			
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1288,7 +1288,8 @@ void __init kmem_cache_init(void)
 
				 	 * Initialize the caches that provide memory for the  kmem_cache_node
			
 
				 	 * structures first.  Without this, further allocations will bug.
			
 
				 	 */
			
 
				-	kmalloc_caches[INDEX_NODE] = create_kmalloc_cache("kmalloc-node",
			
 
				+	kmalloc_caches[INDEX_NODE] = create_kmalloc_cache(
			
 
				+				kmalloc_info[INDEX_NODE].name,
			
 
				 				kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS);
			
 
				 	slab_state = PARTIAL_NODE;
			
 
				 	setup_kmalloc_cache_index_table();
			
@@ -2332,6 +2333,13 @@ int __kmem_cache_shrink(struct kmem_cache *cachep)
 
				 	return (ret ? 1 : 0);
			
 
				 }
			
 
				 
			
 
				+#ifdef CONFIG_MEMCG
			
 
				+void __kmemcg_cache_deactivate(struct kmem_cache *cachep)
			
 
				+{
			
 
				+	__kmem_cache_shrink(cachep);
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				 int __kmem_cache_shutdown(struct kmem_cache *cachep)
			
 
				 {
			
 
				 	return __kmem_cache_shrink(cachep);