
Merge branch 'akpm' (patches from Andrew)

Merge misc fixes from Andrew Morton:
 "15 fixes"

[ This does not merge the "fortify: use WARN instead of BUG for now"
  patch, which needs a bit of extra work to build cleanly with all
  configurations. Arnd is on it.   - Linus ]

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  ocfs2: don't clear SGID when inheriting ACLs
  mm: allow page_cache_get_speculative in interrupt context
  userfaultfd: non-cooperative: flush event_wqh at release time
  ipc: add missing container_of()s for randstruct
  cpuset: fix a deadlock due to incomplete patching of cpusets_enabled()
  userfaultfd_zeropage: return -ENOSPC in case mm has gone
  mm: take memory hotplug lock within numa_zonelist_order_handler()
  mm/page_io.c: fix oops during block io poll in swapin path
  zram: do not free pool->size_class
  kthread: fix documentation build warning
  kasan: avoid -Wmaybe-uninitialized warning
  userfaultfd: non-cooperative: notify about unmap of destination during mremap
  mm, mprotect: flush TLB if potentially racing with a parallel reclaim leaving stale TLB entries
  pid: kill pidhash_size in pidhash_init()
  mm/hugetlb.c: __get_user_pages ignores certain follow_hugetlb_page errors
Linus Torvalds, 8 years ago
Commit 995d03ae26
22 changed files with 109 additions and 33 deletions
  1. fs/ocfs2/acl.c (+12 -12)
  2. fs/userfaultfd.c (+5 -0)
  3. include/linux/cpuset.h (+17 -2)
  4. include/linux/kthread.h (+1 -1)
  5. include/linux/mm_types.h (+4 -0)
  6. include/linux/pagemap.h (+0 -2)
  7. ipc/msg.c (+2 -1)
  8. ipc/sem.c (+2 -1)
  9. ipc/shm.c (+3 -1)
  10. kernel/cgroup/cpuset.c (+1 -0)
  11. kernel/pid.c (+0 -3)
  12. mm/hugetlb.c (+3 -6)
  13. mm/internal.h (+4 -1)
  14. mm/kasan/report.c (+1 -0)
  15. mm/madvise.c (+1 -0)
  16. mm/memory.c (+1 -0)
  17. mm/mprotect.c (+1 -0)
  18. mm/mremap.c (+6 -2)
  19. mm/page_alloc.c (+2 -0)
  20. mm/page_io.c (+7 -0)
  21. mm/rmap.c (+36 -0)
  22. mm/zsmalloc.c (+0 -1)

+ 12 - 12
fs/ocfs2/acl.c

@@ -240,18 +240,6 @@ int ocfs2_set_acl(handle_t *handle,
 	switch (type) {
 	case ACL_TYPE_ACCESS:
 		name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
-		if (acl) {
-			umode_t mode;
-
-			ret = posix_acl_update_mode(inode, &mode, &acl);
-			if (ret)
-				return ret;
-
-			ret = ocfs2_acl_set_mode(inode, di_bh,
-						 handle, mode);
-			if (ret)
-				return ret;
-		}
 		break;
 	case ACL_TYPE_DEFAULT:
 		name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
@@ -289,7 +277,19 @@ int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 	had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh);
 	if (had_lock < 0)
 		return had_lock;
+	if (type == ACL_TYPE_ACCESS && acl) {
+		umode_t mode;
+
+		status = posix_acl_update_mode(inode, &mode, &acl);
+		if (status)
+			goto unlock;
+
+		status = ocfs2_acl_set_mode(inode, bh, NULL, mode);
+		if (status)
+			goto unlock;
+	}
 	status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL);
+unlock:
 	ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
 	brelse(bh);
 	return status;
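
Why moving the call fixes the SGID bug: posix_acl_update_mode() recomputes the inode mode from the ACL and, as a side effect, strips S_ISGID whenever the caller is not in the inode's owning group and lacks CAP_FSETID. On the ACL-inheritance path at create time the creating task frequently fails that test, so the SGID bit inherited from the parent directory was being cleared. A simplified sketch of what the helper does (abridged from fs/posix_acl.c, not the verbatim implementation):

	int posix_acl_update_mode(struct inode *inode, umode_t *mode_p,
				  struct posix_acl **acl)
	{
		umode_t mode = inode->i_mode;
		int error;

		/* fold the ACL's owner/group/other entries into the mode bits */
		error = posix_acl_equiv_mode(*acl, &mode);
		if (error < 0)
			return error;
		if (error == 0)
			*acl = NULL;	/* ACL adds nothing beyond the mode */

		/* the SGID-clearing side effect that broke inheritance */
		if (!in_group_p(inode->i_gid) &&
		    !capable_wrt_inode_uidgid(inode, CAP_FSETID))
			mode &= ~S_ISGID;
		*mode_p = mode;
		return 0;
	}

Hoisting the call into ocfs2_iop_set_acl(), which is only reached through setxattr, keeps the mode update for explicit ACL changes while leaving the inheritance path (which calls ocfs2_set_acl() directly) alone.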

+ 5 - 0
fs/userfaultfd.c

@@ -854,6 +854,9 @@ wakeup:
 	__wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, &range);
 	spin_unlock(&ctx->fault_pending_wqh.lock);
 
+	/* Flush pending events that may still wait on event_wqh */
+	wake_up_all(&ctx->event_wqh);
+
 	wake_up_poll(&ctx->fd_wqh, POLLHUP);
 	userfaultfd_ctx_put(ctx);
 	return 0;
@@ -1643,6 +1646,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
 		ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
 				     uffdio_zeropage.range.len);
 		mmput(ctx->mm);
+	} else {
+		return -ENOSPC;
 	}
 	if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
 		return -EFAULT;
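
The new else branch fires when mmget_not_zero(ctx->mm) fails, i.e. the monitored address space has already been torn down. Returning -ENOSPC matches what UFFDIO_COPY reports in the same situation, so a monitor can handle both uniformly. A hypothetical userspace sketch (variable names and error handling abbreviated):

	struct uffdio_zeropage zp = {
		.range.start = fault_addr & ~(page_size - 1),
		.range.len   = page_size,
	};

	if (ioctl(uffd, UFFDIO_ZEROPAGE, &zp) == -1) {
		if (errno == ENOSPC) {
			/* target process exited: stop serving its faults */
		} else {
			/* genuine failure */
		}
	}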

+ 17 - 2
include/linux/cpuset.h

@@ -18,6 +18,19 @@
 
 #ifdef CONFIG_CPUSETS
 
+/*
+ * Static branch rewrites can happen in an arbitrary order for a given
+ * key. In code paths where we need to loop with read_mems_allowed_begin() and
+ * read_mems_allowed_retry() to get a consistent view of mems_allowed, we need
+ * to ensure that begin() always gets rewritten before retry() in the
+ * disabled -> enabled transition. If not, then if local irqs are disabled
+ * around the loop, we can deadlock since retry() would always be
+ * comparing the latest value of the mems_allowed seqcount against 0 as
+ * begin() still would see cpusets_enabled() as false. The enabled -> disabled
+ * transition should happen in reverse order for the same reasons (want to stop
+ * looking at real value of mems_allowed.sequence in retry() first).
+ */
+extern struct static_key_false cpusets_pre_enable_key;
 extern struct static_key_false cpusets_enabled_key;
 static inline bool cpusets_enabled(void)
 {
@@ -32,12 +45,14 @@ static inline int nr_cpusets(void)
 
 static inline void cpuset_inc(void)
 {
+	static_branch_inc(&cpusets_pre_enable_key);
 	static_branch_inc(&cpusets_enabled_key);
 }
 
 static inline void cpuset_dec(void)
 {
 	static_branch_dec(&cpusets_enabled_key);
+	static_branch_dec(&cpusets_pre_enable_key);
 }
 
 extern int cpuset_init(void);
@@ -115,7 +130,7 @@ extern void cpuset_print_current_mems_allowed(void);
  */
 static inline unsigned int read_mems_allowed_begin(void)
 {
-	if (!cpusets_enabled())
+	if (!static_branch_unlikely(&cpusets_pre_enable_key))
 		return 0;
 
 	return read_seqcount_begin(&current->mems_allowed_seq);
@@ -129,7 +144,7 @@ static inline unsigned int read_mems_allowed_begin(void)
  */
 static inline bool read_mems_allowed_retry(unsigned int seq)
 {
-	if (!cpusets_enabled())
+	if (!static_branch_unlikely(&cpusets_enabled_key))
 		return false;
 
 	return read_seqcount_retry(&current->mems_allowed_seq, seq);
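
The split matters for callers that loop on the pair, as in the allocator's retry pattern (an illustrative sketch, not a quote from the allocator; alloc_from_current_mems_allowed() is a hypothetical stand-in):

	unsigned int cpuset_mems_cookie;
	struct page *page;

	do {
		/* begin() is keyed on cpusets_pre_enable_key after this patch */
		cpuset_mems_cookie = read_mems_allowed_begin();
		page = alloc_from_current_mems_allowed();
		/* retry() stays keyed on cpusets_enabled_key */
	} while (!page && read_mems_allowed_retry(cpuset_mems_cookie));

With a single key, a static-branch rewrite landing in retry() before begin() would let retry() compare the live seqcount against the unconditional 0 from a still-disabled begin(); once the seqcount is nonzero that comparison always demands a retry, and with IRQs disabled around the loop nothing can break the cycle — the deadlock the comment describes.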

+ 1 - 1
include/linux/kthread.h

@@ -15,7 +15,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
  * @threadfn: the function to run in the thread
  * @data: data pointer for @threadfn()
  * @namefmt: printf-style format string for the thread name
- * @...: arguments for @namefmt.
+ * @arg...: arguments for @namefmt.
  *
  * This macro will create a kthread on the current node, leaving it in
  * the stopped state.  This is just a helper for kthread_create_on_node();

+ 4 - 0
include/linux/mm_types.h

@@ -494,6 +494,10 @@ struct mm_struct {
 	 * PROT_NONE or PROT_NUMA mapped page.
 	 */
 	bool tlb_flush_pending;
+#endif
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+	/* See flush_tlb_batched_pending() */
+	bool tlb_flush_batched;
 #endif
 	struct uprobes_state uprobes_state;
 #ifdef CONFIG_HUGETLB_PAGE

+ 0 - 2
include/linux/pagemap.h

@@ -163,8 +163,6 @@ void release_pages(struct page **pages, int nr, bool cold);
  */
 static inline int page_cache_get_speculative(struct page *page)
 {
-	VM_BUG_ON(in_interrupt());
-
 #ifdef CONFIG_TINY_RCU
 # ifdef CONFIG_PREEMPT_COUNT
 	VM_BUG_ON(!in_atomic() && !irqs_disabled());

+ 2 - 1
ipc/msg.c

@@ -1034,7 +1034,8 @@ void msg_exit_ns(struct ipc_namespace *ns)
 static int sysvipc_msg_proc_show(struct seq_file *s, void *it)
 {
 	struct user_namespace *user_ns = seq_user_ns(s);
-	struct msg_queue *msq = it;
+	struct kern_ipc_perm *ipcp = it;
+	struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm);
 
 	seq_printf(s,
 		   "%10d %10d  %4o  %10lu %10lu %5u %5u %5u %5u %5u %5u %10lu %10lu %10lu\n",

+ 2 - 1
ipc/sem.c

@@ -2179,7 +2179,8 @@ void exit_sem(struct task_struct *tsk)
 static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
 {
 	struct user_namespace *user_ns = seq_user_ns(s);
-	struct sem_array *sma = it;
+	struct kern_ipc_perm *ipcp = it;
+	struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);
 	time_t sem_otime;
 
 	/*

+ 3 - 1
ipc/shm.c

@@ -1380,9 +1380,11 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
 static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
 {
 	struct user_namespace *user_ns = seq_user_ns(s);
-	struct shmid_kernel *shp = it;
+	struct kern_ipc_perm *ipcp = it;
+	struct shmid_kernel *shp;
 	unsigned long rss = 0, swp = 0;
 
+	shp = container_of(ipcp, struct shmid_kernel, shm_perm);
 	shm_add_rss_swap(shp, &rss, &swp);
 
 #if BITS_PER_LONG <= 32

+ 1 - 0
kernel/cgroup/cpuset.c

@@ -63,6 +63,7 @@
 #include <linux/cgroup.h>
 #include <linux/wait.h>
 
+DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
 DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
 
 /* See "Frequency meter" comments, below. */

+ 0 - 3
kernel/pid.c

@@ -575,13 +575,10 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
  */
 void __init pidhash_init(void)
 {
-	unsigned int pidhash_size;
-
 	pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
 					   HASH_EARLY | HASH_SMALL | HASH_ZERO,
 					   &pidhash_shift, NULL,
 					   0, 4096);
-	pidhash_size = 1U << pidhash_shift;
 }
 
 void __init pidmap_init(void)

+ 3 - 6
mm/hugetlb.c

@@ -4078,6 +4078,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned long vaddr = *position;
 	unsigned long remainder = *nr_pages;
 	struct hstate *h = hstate_vma(vma);
+	int err = -EFAULT;
 
 	while (vaddr < vma->vm_end && remainder) {
 		pte_t *pte;
@@ -4154,11 +4155,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			}
 			ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
 			if (ret & VM_FAULT_ERROR) {
-				int err = vm_fault_to_errno(ret, flags);
-
-				if (err)
-					return err;
-
+				err = vm_fault_to_errno(ret, flags);
 				remainder = 0;
 				break;
 			}
@@ -4213,7 +4210,7 @@ same_page:
 	 */
 	*position = vaddr;
 
-	return i ? i : -EFAULT;
+	return i ? i : err;
 }
 
 #ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
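
The visible contract change: follow_hugetlb_page() previously returned the fault's errno immediately even when earlier pages had been pinned, and fell back to a blanket -EFAULT when nothing was. It now follows the usual get_user_pages() convention — report partial progress if there is any, otherwise the specific error. A sketch of the caller-visible semantics (illustrative, not kernel code):

	long pinned = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
	if (pinned > 0) {
		/* partial success: 'pinned' pages are valid even if a
		 * later page faulted; the caller may retry the rest */
	} else if (pinned < 0) {
		/* nothing pinned: a specific errno from
		 * vm_fault_to_errno(), e.g. -ENOMEM or -EHWPOISON,
		 * rather than an unconditional -EFAULT */
	}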

+ 4 - 1
mm/internal.h

@@ -498,6 +498,7 @@ extern struct workqueue_struct *mm_percpu_wq;
 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 void try_to_unmap_flush(void);
 void try_to_unmap_flush_dirty(void);
+void flush_tlb_batched_pending(struct mm_struct *mm);
 #else
 static inline void try_to_unmap_flush(void)
 {
@@ -505,7 +506,9 @@ static inline void try_to_unmap_flush(void)
 static inline void try_to_unmap_flush_dirty(void)
 {
 }
-
+static inline void flush_tlb_batched_pending(struct mm_struct *mm)
+{
+}
 #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
 
 extern const struct trace_print_flags pageflag_names[];

+ 1 - 0
mm/kasan/report.c

@@ -401,6 +401,7 @@ void kasan_report(unsigned long addr, size_t size,
 	disable_trace_on_warning();
 
 	info.access_addr = (void *)addr;
+	info.first_bad_addr = (void *)addr;
 	info.access_size = size;
 	info.is_write = is_write;
 	info.ip = ip;

+ 1 - 0
mm/madvise.c

@@ -320,6 +320,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 
 	tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
 	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	flush_tlb_batched_pending(mm);
 	arch_enter_lazy_mmu_mode();
 	for (; addr != end; pte++, addr += PAGE_SIZE) {
 		ptent = *pte;

+ 1 - 0
mm/memory.c

@@ -1197,6 +1197,7 @@ again:
 	init_rss_vec(rss);
 	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	pte = start_pte;
+	flush_tlb_batched_pending(mm);
 	arch_enter_lazy_mmu_mode();
 	do {
 		pte_t ptent = *pte;

+ 1 - 0
mm/mprotect.c

@@ -64,6 +64,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 	    atomic_read(&vma->vm_mm->mm_users) == 1)
 		target_node = numa_node_id();
 
+	flush_tlb_batched_pending(vma->vm_mm);
 	arch_enter_lazy_mmu_mode();
 	do {
 		oldpte = *pte;

+ 6 - 2
mm/mremap.c

@@ -152,6 +152,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 	new_ptl = pte_lockptr(mm, new_pmd);
 	if (new_ptl != old_ptl)
 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+	flush_tlb_batched_pending(vma->vm_mm);
 	arch_enter_lazy_mmu_mode();
 
 	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
@@ -428,6 +429,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
 static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 		unsigned long new_addr, unsigned long new_len, bool *locked,
 		struct vm_userfaultfd_ctx *uf,
+		struct list_head *uf_unmap_early,
 		struct list_head *uf_unmap)
 {
 	struct mm_struct *mm = current->mm;
@@ -446,7 +448,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 	if (addr + old_len > new_addr && new_addr + new_len > addr)
 		goto out;
 
-	ret = do_munmap(mm, new_addr, new_len, NULL);
+	ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
 	if (ret)
 		goto out;
 
@@ -514,6 +516,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 	unsigned long charged = 0;
 	bool locked = false;
 	struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
+	LIST_HEAD(uf_unmap_early);
 	LIST_HEAD(uf_unmap);
 
 	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
@@ -541,7 +544,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 
 	if (flags & MREMAP_FIXED) {
 		ret = mremap_to(addr, old_len, new_addr, new_len,
-				&locked, &uf, &uf_unmap);
+				&locked, &uf, &uf_unmap_early, &uf_unmap);
 		goto out;
 	}
 
@@ -621,6 +624,7 @@ out:
 	up_write(&current->mm->mmap_sem);
 	if (locked && new_len > old_len)
 		mm_populate(new_addr + old_len, new_len - old_len);
+	userfaultfd_unmap_complete(mm, &uf_unmap_early);
 	mremap_userfaultfd_complete(&uf, addr, new_addr, old_len);
 	userfaultfd_unmap_complete(mm, &uf_unmap);
 	return ret;

+ 2 - 0
mm/page_alloc.c

@@ -4891,9 +4891,11 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write,
 				NUMA_ZONELIST_ORDER_LEN);
 			user_zonelist_order = oldval;
 		} else if (oldval != user_zonelist_order) {
+			mem_hotplug_begin();
 			mutex_lock(&zonelists_mutex);
 			build_all_zonelists(NULL, NULL);
 			mutex_unlock(&zonelists_mutex);
+			mem_hotplug_done();
 		}
 	}
 out:

+ 7 - 0
mm/page_io.c

@@ -22,6 +22,7 @@
 #include <linux/frontswap.h>
 #include <linux/blkdev.h>
 #include <linux/uio.h>
+#include <linux/sched/task.h>
 #include <asm/pgtable.h>
 
 static struct bio *get_swap_bio(gfp_t gfp_flags,
@@ -136,6 +137,7 @@ out:
 	WRITE_ONCE(bio->bi_private, NULL);
 	bio_put(bio);
 	wake_up_process(waiter);
+	put_task_struct(waiter);
 }
 
 int generic_swapfile_activate(struct swap_info_struct *sis,
@@ -378,6 +380,11 @@ int swap_readpage(struct page *page, bool do_poll)
 		goto out;
 	}
 	bdev = bio->bi_bdev;
+	/*
+	 * Keep this task valid during swap readpage because the oom killer may
+	 * attempt to access it in the page fault retry time check.
+	 */
+	get_task_struct(current);
 	bio->bi_private = current;
 	bio_set_op_attrs(bio, REQ_OP_READ, 0);
 	count_vm_event(PSWPIN);
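
The two hunks pair up across the I/O: the submitter pins itself before publishing the task pointer in bio->bi_private, and the completion handler unpins after the wakeup, so the task_struct stays valid however late the interrupt fires. The pattern in isolation (a sketch; get_task_struct()/put_task_struct() are the standard task refcounting API):

	get_task_struct(current);	/* completion path now owns a reference */
	bio->bi_private = current;
	submit_bio(bio);

	/* ...later, in the bi_end_io handler: */
	struct task_struct *waiter = bio->bi_private;
	wake_up_process(waiter);
	put_task_struct(waiter);	/* may be the final reference */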

+ 36 - 0
mm/rmap.c

@@ -604,6 +604,13 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
 	arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
 	tlb_ubc->flush_required = true;
 
+	/*
+	 * Ensure compiler does not re-order the setting of tlb_flush_batched
+	 * before the PTE is cleared.
+	 */
+	barrier();
+	mm->tlb_flush_batched = true;
+
 	/*
 	 * If the PTE was dirty then it's best to assume it's writable. The
 	 * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
@@ -631,6 +638,35 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
 
 	return should_defer;
 }
+
+/*
+ * Reclaim unmaps pages under the PTL but do not flush the TLB prior to
+ * releasing the PTL if TLB flushes are batched. It's possible for a parallel
+ * operation such as mprotect or munmap to race between reclaim unmapping
+ * the page and flushing the page. If this race occurs, it potentially allows
+ * access to data via a stale TLB entry. Tracking all mm's that have TLB
+ * batching in flight would be expensive during reclaim so instead track
+ * whether TLB batching occurred in the past and if so then do a flush here
+ * if required. This will cost one additional flush per reclaim cycle paid
+ * by the first operation at risk such as mprotect and munmap.
+ *
+ * This must be called under the PTL so that an access to tlb_flush_batched
+ * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
+ * via the PTL.
+ */
+void flush_tlb_batched_pending(struct mm_struct *mm)
+{
+	if (mm->tlb_flush_batched) {
+		flush_tlb_mm(mm);
+
+		/*
+		 * Do not allow the compiler to re-order the clearing of
+		 * tlb_flush_batched before the tlb is flushed.
+		 */
+		barrier();
+		mm->tlb_flush_batched = false;
+	}
+}
 #else
 static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
 {
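
Both barrier() calls are compiler barriers only — roughly the definition below — so they stop the compiler from reordering the tlb_flush_batched store against the PTE manipulation, while cross-CPU ordering comes from the page table lock; that is why flush_tlb_batched_pending() is documented as PTL-only:

	/* roughly as in include/linux/compiler.h: an empty asm with a
	 * "memory" clobber; it emits no instructions but forbids the
	 * compiler from moving memory accesses across it */
	#define barrier() __asm__ __volatile__("" : : : "memory")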

+ 0 - 1
mm/zsmalloc.c

@@ -2453,7 +2453,6 @@ void zs_destroy_pool(struct zs_pool *pool)
 	}
 
 	destroy_cache(pool);
-	kfree(pool->size_class);
 	kfree(pool->name);
 	kfree(pool);
 }