@@ -555,48 +555,43 @@ drop_precision:
 }
 
 /*
- * Atomically advance counter to the new value. Interrupts, vcpu
- * scheduling, and scaling inaccuracies can cause cputime_advance
- * to be occasionally called with a new value smaller than counter.
- * Let's enforce atomicity.
+ * Adjust tick based cputime random precision against scheduler runtime
+ * accounting.
  *
- * Normally a caller will only go through this loop once, or not
- * at all in case a previous caller updated counter the same jiffy.
- */
-static void cputime_advance(cputime_t *counter, cputime_t new)
-{
-        cputime_t old;
-
-        while (new > (old = READ_ONCE(*counter)))
-                cmpxchg_cputime(counter, old, new);
-}
-
-/*
- * Adjust tick based cputime random precision against scheduler
- * runtime accounting.
+ * Tick based cputime accounting depend on random scheduling timeslices of a
+ * task to be interrupted or not by the timer. Depending on these
+ * circumstances, the number of these interrupts may be over or
+ * under-optimistic, matching the real user and system cputime with a variable
+ * precision.
+ *
+ * Fix this by scaling these tick based values against the total runtime
+ * accounted by the CFS scheduler.
+ *
+ * This code provides the following guarantees:
+ *
+ *   stime + utime == rtime
+ *   stime_i+1 >= stime_i, utime_i+1 >= utime_i
+ *
+ * Assuming that rtime_i+1 >= rtime_i.
  */
 static void cputime_adjust(struct task_cputime *curr,
-                           struct cputime *prev,
+                           struct prev_cputime *prev,
                            cputime_t *ut, cputime_t *st)
 {
         cputime_t rtime, stime, utime;
+        unsigned long flags;
 
-        /*
-         * Tick based cputime accounting depend on random scheduling
-         * timeslices of a task to be interrupted or not by the timer.
-         * Depending on these circumstances, the number of these interrupts
-         * may be over or under-optimistic, matching the real user and system
-         * cputime with a variable precision.
-         *
-         * Fix this by scaling these tick based values against the total
-         * runtime accounted by the CFS scheduler.
-         */
+        /* Serialize concurrent callers such that we can honour our guarantees */
+        raw_spin_lock_irqsave(&prev->lock, flags);
         rtime = nsecs_to_cputime(curr->sum_exec_runtime);
 
         /*
-         * Update userspace visible utime/stime values only if actual execution
-         * time is bigger than already exported. Note that can happen, that we
-         * provided bigger values due to scaling inaccuracy on big numbers.
+         * This is possible under two circumstances:
+         *      - rtime isn't monotonic after all (a bug);
+         *      - we got reordered by the lock.
+         *
+         * In both cases this acts as a filter such that the rest of the code
+         * can assume it is monotonic regardless of anything else.
          */
         if (prev->stime + prev->utime >= rtime)
                 goto out;
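
For illustration only (the numbers here are made up, not taken from the patch): suppose the first sample sees 2 ticks of stime and 2 ticks of utime against rtime = 6. The scaling in the second hunk below gives stime = 2 * 6 / 4 = 3 and utime = 6 - 3 = 3. If the next sample then reports 2 and 3 ticks against rtime = 7, the freshly scaled stime would be 2 * 7 / 5 = 2 and move backwards; the clamps below instead keep stime at 3 and hand the whole rtime delta to utime (7 - 3 = 4), so stime + utime == rtime still holds and neither value decreases.
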
@@ -606,22 +601,46 @@ static void cputime_adjust(struct task_cputime *curr,
 
         if (utime == 0) {
                 stime = rtime;
-        } else if (stime == 0) {
-                utime = rtime;
-        } else {
-                cputime_t total = stime + utime;
+                goto update;
+        }
 
-                stime = scale_stime((__force u64)stime,
-                                    (__force u64)rtime, (__force u64)total);
-                utime = rtime - stime;
+        if (stime == 0) {
+                utime = rtime;
+                goto update;
         }
 
-        cputime_advance(&prev->stime, stime);
-        cputime_advance(&prev->utime, utime);
+        stime = scale_stime((__force u64)stime, (__force u64)rtime,
+                            (__force u64)(stime + utime));
+
+        /*
+         * Make sure stime doesn't go backwards; this preserves monotonicity
+         * for utime because rtime is monotonic.
+         *
+         *  utime_i+1 = rtime_i+1 - stime_i
+         *            = rtime_i+1 - (rtime_i - utime_i)
+         *            = (rtime_i+1 - rtime_i) + utime_i
+         *            >= utime_i
+         */
+        if (stime < prev->stime)
+                stime = prev->stime;
+        utime = rtime - stime;
+
+        /*
+         * Make sure utime doesn't go backwards; this still preserves
+         * monotonicity for stime, analogous argument to above.
+         */
+        if (utime < prev->utime) {
+                utime = prev->utime;
+                stime = rtime - utime;
+        }
 
+update:
+        prev->stime = stime;
+        prev->utime = utime;
 out:
         *ut = prev->utime;
         *st = prev->stime;
+        raw_spin_unlock_irqrestore(&prev->lock, flags);
 }
 
 void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
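
To play with the scale-and-clamp logic outside the kernel, here is a minimal userspace sketch of the function above. It is an approximation under stated assumptions: the names prev_cputime_model, scale_stime_model and cputime_adjust_model are invented for the example, scale_stime() is reduced to a plain multiply/divide (the kernel version also drops precision to avoid 64x64-bit overflow), and the raw spinlock is left out because the model is single-threaded.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for struct prev_cputime, minus the lock. */
struct prev_cputime_model {
        uint64_t utime;
        uint64_t stime;
};

/* Naive stime * rtime / total; the kernel guards against 64x64 overflow. */
static uint64_t scale_stime_model(uint64_t stime, uint64_t rtime, uint64_t total)
{
        return total ? stime * rtime / total : rtime;
}

/* Mirrors the locked section of cputime_adjust() in the patch. */
static void cputime_adjust_model(uint64_t tick_stime, uint64_t tick_utime,
                                 uint64_t rtime, struct prev_cputime_model *prev,
                                 uint64_t *ut, uint64_t *st)
{
        uint64_t stime = tick_stime, utime = tick_utime;

        /* Filter: the rest of the code may assume rtime is monotonic. */
        if (prev->stime + prev->utime >= rtime)
                goto out;

        if (utime == 0) {
                stime = rtime;
                goto update;
        }

        if (stime == 0) {
                utime = rtime;
                goto update;
        }

        stime = scale_stime_model(stime, rtime, stime + utime);

        /* Clamp so stime never goes backwards; utime then stays monotonic. */
        if (stime < prev->stime)
                stime = prev->stime;
        utime = rtime - stime;

        /* Symmetric clamp for utime. */
        if (utime < prev->utime) {
                utime = prev->utime;
                stime = rtime - utime;
        }

update:
        prev->stime = stime;
        prev->utime = utime;
out:
        *ut = prev->utime;
        *st = prev->stime;
}

int main(void)
{
        struct prev_cputime_model prev = { 0, 0 };
        uint64_t ut, st;

        cputime_adjust_model(2, 2, 6, &prev, &ut, &st);  /* ut = 3, st = 3 */
        cputime_adjust_model(2, 3, 7, &prev, &ut, &st);  /* st clamped to 3, ut = 4 */
        assert(ut + st == 7 && st >= 3 && ut >= 3);
        printf("utime=%llu stime=%llu\n",
               (unsigned long long)ut, (unsigned long long)st);
        return 0;
}

Feeding this model monotonically increasing rtime samples never makes either reported value go backwards, and the reported pair always sums to the rtime of the last accepted sample, which is exactly the pair of guarantees stated in the comment above.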