10 years ago · 9dbbe3cfc3
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -95,7 +95,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
 
				 
			
 
				 struct pvclock_vsyscall_time_info {
			
 
				 	struct pvclock_vcpu_time_info pvti;
			
 
				-	u32 migrate_count;
			
 
				 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
			
 
				 
			
 
				 #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
			
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -141,46 +141,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
 
				 	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
			
 
				 }
			
 
				 
			
 
				-static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
			
 
				-
			
 
				-static struct pvclock_vsyscall_time_info *
			
 
				-pvclock_get_vsyscall_user_time_info(int cpu)
			
 
				-{
			
 
				-	if (!pvclock_vdso_info) {
			
 
				-		BUG();
			
 
				-		return NULL;
			
 
				-	}
			
 
				-
			
 
				-	return &pvclock_vdso_info[cpu];
			
 
				-}
			
 
				-
			
 
				-struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
			
 
				-{
			
 
				-	return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
			
 
				-}
			
 
				-
			
 
				 #ifdef CONFIG_X86_64
			
 
				-static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
			
 
				-			        void *v)
			
 
				-{
			
 
				-	struct task_migration_notifier *mn = v;
			
 
				-	struct pvclock_vsyscall_time_info *pvti;
			
 
				-
			
 
				-	pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
			
 
				-
			
 
				-	/* this is NULL when pvclock vsyscall is not initialized */
			
 
				-	if (unlikely(pvti == NULL))
			
 
				-		return NOTIFY_DONE;
			
 
				-
			
 
				-	pvti->migrate_count++;
			
 
				-
			
 
				-	return NOTIFY_DONE;
			
 
				-}
			
 
				-
			
 
				-static struct notifier_block pvclock_migrate = {
			
 
				-	.notifier_call = pvclock_task_migrate,
			
 
				-};
			
 
				-
			
 
				 /*
			
 
				  * Initialize the generic pvclock vsyscall state.  This will allocate
			
 
				  * a/some page(s) for the per-vcpu pvclock information, set up a
			
@@ -194,17 +155,12 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
 
				 
			
 
				 	WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
			
 
				 
			
 
				-	pvclock_vdso_info = i;
			
 
				-
			
 
				 	for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
			
 
				 		__set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
			
 
				 			     __pa(i) + (idx*PAGE_SIZE),
			
 
				 			     PAGE_KERNEL_VVAR);
			
 
				 	}
			
 
				 
			
 
				-
			
 
				-	register_task_migration_notifier(&pvclock_migrate);
			
 
				-
			
 
				 	return 0;
			
 
				 }
			
 
				 #endif
			
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1669,12 +1669,28 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
				 		&guest_hv_clock, sizeof(guest_hv_clock))))
			
 
				 		return 0;
			
 
				 
			
 
				-	/*
			
 
				-	 * The interface expects us to write an even number signaling that the
			
 
				-	 * update is finished. Since the guest won't see the intermediate
			
 
				-	 * state, we just increase by 2 at the end.
			
 
				+	/* This VCPU is paused, but it's legal for a guest to read another
			
 
				+	 * VCPU's kvmclock, so we really have to follow the specification where
			
 
				+	 * it says that version is odd if data is being modified, and even after
			
 
				+	 * it is consistent.
			
 
				+	 *
			
 
				+	 * Version field updates must be kept separate.  This is because
			
 
				+	 * kvm_write_guest_cached might use a "rep movs" instruction, and
			
 
				+	 * writes within a string instruction are weakly ordered.  So there
			
 
				+	 * are three writes overall.
			
 
				+	 *
			
 
				+	 * As a small optimization, only write the version field in the first
			
 
				+	 * and third write.  The vcpu->pv_time cache is still valid, because the
			
 
				+	 * version field is the first in the struct.
			
 
				 	 */
			
 
				-	vcpu->hv_clock.version = guest_hv_clock.version + 2;
			
 
				+	BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
			
 
				+
			
 
				+	vcpu->hv_clock.version = guest_hv_clock.version + 1;
			
 
				+	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
			
 
				+				&vcpu->hv_clock,
			
 
				+				sizeof(vcpu->hv_clock.version));
			
 
				+
			
 
				+	smp_wmb();
			
 
				 
			
 
				 	/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
			
 
				 	pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
			
@@ -1695,6 +1711,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
				 	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
			
 
				 				&vcpu->hv_clock,
			
 
				 				sizeof(vcpu->hv_clock));
			
 
				+
			
 
				+	smp_wmb();
			
 
				+
			
 
				+	vcpu->hv_clock.version++;
			
 
				+	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
			
 
				+				&vcpu->hv_clock,
			
 
				+				sizeof(vcpu->hv_clock.version));
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -82,15 +82,18 @@ static notrace cycle_t vread_pvclock(int *mode)
 
				 	cycle_t ret;
			
 
				 	u64 last;
			
 
				 	u32 version;
			
 
				-	u32 migrate_count;
			
 
				 	u8 flags;
			
 
				 	unsigned cpu, cpu1;
			
 
				 
			
 
				 
			
 
				 	/*
			
 
				-	 * When looping to get a consistent (time-info, tsc) pair, we
			
 
				-	 * also need to deal with the possibility we can switch vcpus,
			
 
				-	 * so make sure we always re-fetch time-info for the current vcpu.
			
 
				+	 * Note: hypervisor must guarantee that:
			
 
				+	 * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
			
 
				+	 * 2. that per-CPU pvclock time info is updated if the
			
 
				+	 *    underlying CPU changes.
			
 
				+	 * 3. that version is increased whenever underlying CPU
			
 
				+	 *    changes.
			
 
				+	 *
			
 
				 	 */
			
 
				 	do {
			
 
				 		cpu = __getcpu() & VGETCPU_CPU_MASK;
			
@@ -99,27 +102,20 @@ static notrace cycle_t vread_pvclock(int *mode)
 
				 		 * __getcpu() calls (Gleb).
			
 
				 		 */
			
 
				 
			
 
				-		/* Make sure migrate_count will change if we leave the VCPU. */
			
 
				-		do {
			
 
				-			pvti = get_pvti(cpu);
			
 
				-			migrate_count = pvti->migrate_count;
			
 
				-
			
 
				-			cpu1 = cpu;
			
 
				-			cpu = __getcpu() & VGETCPU_CPU_MASK;
			
 
				-		} while (unlikely(cpu != cpu1));
			
 
				+		pvti = get_pvti(cpu);
			
 
				 
			
 
				 		version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
			
 
				 
			
 
				 		/*
			
 
				 		 * Test we're still on the cpu as well as the version.
			
 
				-		 * - We must read TSC of pvti's VCPU.
			
 
				-		 * - KVM doesn't follow the versioning protocol, so data could
			
 
				-		 *   change before version if we left the VCPU.
			
 
				+		 * We could have been migrated just after the first
			
 
				+		 * vgetcpu but before fetching the version, so we
			
 
				+		 * wouldn't notice a version change.
			
 
				 		 */
			
 
				-		smp_rmb();
			
 
				-	} while (unlikely((pvti->pvti.version & 1) ||
			
 
				-			  pvti->pvti.version != version ||
			
 
				-			  pvti->migrate_count != migrate_count));
			
 
				+		cpu1 = __getcpu() & VGETCPU_CPU_MASK;
			
 
				+	} while (unlikely(cpu != cpu1 ||
			
 
				+			  (pvti->pvti.version & 1) ||
			
 
				+			  pvti->pvti.version != version));
			
 
				 
			
 
				 	if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
			
 
				 		*mode = VCLOCK_NONE;
			
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -175,14 +175,6 @@ extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
 
				 extern void calc_global_load(unsigned long ticks);
			
 
				 extern void update_cpu_load_nohz(void);
			
 
				 
			
 
				-/* Notifier for when a task gets migrated to a new CPU */
			
 
				-struct task_migration_notifier {
			
 
				-	struct task_struct *task;
			
 
				-	int from_cpu;
			
 
				-	int to_cpu;
			
 
				-};
			
 
				-extern void register_task_migration_notifier(struct notifier_block *n);
			
 
				-
			
 
				 extern unsigned long get_parent_ip(unsigned long addr);
			
 
				 
			
 
				 extern void dump_cpu_task(int cpu);
			
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1016,13 +1016,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 
				 		rq_clock_skip_update(rq, true);
			
 
				 }
			
 
				 
			
 
				-static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
			
 
				-
			
 
				-void register_task_migration_notifier(struct notifier_block *n)
			
 
				-{
			
 
				-	atomic_notifier_chain_register(&task_migration_notifier, n);
			
 
				-}
			
 
				-
			
 
				 #ifdef CONFIG_SMP
			
 
				 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
			
 
				 {
			
@@ -1053,18 +1046,10 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
				 	trace_sched_migrate_task(p, new_cpu);
			
 
				 
			
 
				 	if (task_cpu(p) != new_cpu) {
			
 
				-		struct task_migration_notifier tmn;
			
 
				-
			
 
				 		if (p->sched_class->migrate_task_rq)
			
 
				 			p->sched_class->migrate_task_rq(p, new_cpu);
			
 
				 		p->se.nr_migrations++;
			
 
				 		perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
			
 
				-
			
 
				-		tmn.task = p;
			
 
				-		tmn.from_cpu = task_cpu(p);
			
 
				-		tmn.to_cpu = new_cpu;
			
 
				-
			
 
				-		atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
			
 
				 	}
			
 
				 
			
 
				 	__set_task_cpu(p, new_cpu);