@@ -75,17 +75,20 @@
  * The waiter reads the futex value in user space and calls
  * futex_wait(). This function computes the hash bucket and acquires
  * the hash bucket lock. After that it reads the futex user space value
- * again and verifies that the data has not changed. If it has not
- * changed it enqueues itself into the hash bucket, releases the hash
- * bucket lock and schedules.
+ * again and verifies that the data has not changed. If it has not changed
+ * it enqueues itself into the hash bucket, releases the hash bucket lock
+ * and schedules.
  *
  * The waker side modifies the user space value of the futex and calls
- * futex_wake(). This functions computes the hash bucket and acquires
- * the hash bucket lock. Then it looks for waiters on that futex in the
- * hash bucket and wakes them.
+ * futex_wake(). This function computes the hash bucket and acquires the
+ * hash bucket lock. Then it looks for waiters on that futex in the hash
+ * bucket and wakes them.
  *
- * Note that the spin_lock serializes waiters and wakers, so that the
- * following scenario is avoided:
+ * In futex wake up scenarios where no tasks are blocked on a futex, taking
+ * the hb spinlock can be avoided and we can simply return. In order for this
+ * optimization to work, ordering guarantees must exist so that the waiter
+ * being added to the list is acknowledged when the list is concurrently being
+ * checked by the waker, avoiding scenarios like the following:
  *
  * CPU 0                               CPU 1
  * val = *futex;
@@ -106,24 +109,52 @@
  * This would cause the waiter on CPU 0 to wait forever because it
  * missed the transition of the user space value from val to newval
  * and the waker did not find the waiter in the hash bucket queue.
- * The spinlock serializes that:
  *
- * CPU 0                       CPU 1
+ * The correct serialization ensures that a waiter either observes
+ * the changed user space value before blocking or is woken by a
+ * concurrent waker:
+ *
+ * CPU 0                                 CPU 1
  * val = *futex;
  * sys_futex(WAIT, futex, val);
  *   futex_wait(futex, val);
- *   lock(hash_bucket(futex));
- *   uval = *futex;
- *                               *futex = newval;
- *                               sys_futex(WAKE, futex);
- *                                 futex_wake(futex);
- *                                 lock(hash_bucket(futex));
+ *
+ *   waiters++;
+ *   mb(); (A) <-- paired with -.
+ *                              |
+ *   lock(hash_bucket(futex));  |
+ *                              |
+ *   uval = *futex;             |
+ *                              |        *futex = newval;
+ *                              |        sys_futex(WAKE, futex);
+ *                              |          futex_wake(futex);
+ *                              |
+ *                              `------->  mb(); (B)
  *   if (uval == val)
- *      queue();
+ *     queue();
  *   unlock(hash_bucket(futex));
- *   schedule();                 if (!queue_empty())
- *                                 wake_waiters(futex);
- *                               unlock(hash_bucket(futex));
+ *   schedule();                         if (waiters)
+ *                                         lock(hash_bucket(futex));
+ *                                         wake_waiters(futex);
+ *                                         unlock(hash_bucket(futex));
+ *
+ * Where (A) orders the waiters increment and the futex value read -- this
+ * is guaranteed by the head counter in the hb spinlock; and where (B)
+ * orders the write to futex and the waiters read -- this is done by the
+ * barriers in get_futex_key_refs(), through either ihold or atomic_inc,
+ * depending on the futex type.
+ *
+ * This yields the following case (where X:=waiters, Y:=futex):
+ *
+ *	X = Y = 0
+ *
+ *	w[X]=1		w[Y]=1
+ *	MB		MB
+ *	r[Y]=y		r[X]=x
+ *
+ * Which guarantees that x==0 && y==0 is impossible; which translates back into
+ * the guarantee that we cannot both miss the futex variable change and the
+ * enqueue.
  */
 
 int __read_mostly futex_cmpxchg_enabled;
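
The X/Y argument above is the classic store-buffering litmus test with a full barrier on each side. As a stand-alone illustration (not part of the patch; all names below are made up), the same pattern can be written with C11 atomics, where seq_cst fences play the roles of mb() (A) and (B):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int X;	/* stands in for "waiters" */
static atomic_int Y;	/* stands in for the futex word */
static int x, y;	/* values observed by each side */

static void *waiter_side(void *arg)
{
	atomic_store_explicit(&X, 1, memory_order_relaxed);	/* w[X]=1 */
	atomic_thread_fence(memory_order_seq_cst);		/* MB (A) */
	y = atomic_load_explicit(&Y, memory_order_relaxed);	/* r[Y]=y */
	return NULL;
}

static void *waker_side(void *arg)
{
	atomic_store_explicit(&Y, 1, memory_order_relaxed);	/* w[Y]=1 */
	atomic_thread_fence(memory_order_seq_cst);		/* MB (B) */
	x = atomic_load_explicit(&X, memory_order_relaxed);	/* r[X]=x */
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, waiter_side, NULL);
	pthread_create(&b, NULL, waker_side, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);

	/* With both fences in place, x == 0 && y == 0 cannot happen. */
	printf("x=%d y=%d\n", x, y);
	return 0;
}

Dropping either fence reopens the window in which the waiter misses the new futex value and the waker misses the queued waiter.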
@@ -211,6 +242,36 @@ static unsigned long __read_mostly futex_hashsize;
 
 static struct futex_hash_bucket *futex_queues;
 
+static inline void futex_get_mm(union futex_key *key)
+{
+	atomic_inc(&key->private.mm->mm_count);
+	/*
+	 * Ensure futex_get_mm() implies a full barrier such that
+	 * get_futex_key() implies a full barrier. This is relied upon
+	 * as full barrier (B), see the ordering comment above.
+	 */
+	smp_mb__after_atomic_inc();
+}
+
+static inline bool hb_waiters_pending(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+	/*
+	 * Tasks trying to enter the critical region are most likely
+	 * potential waiters that will be added to the plist. Ensure
+	 * that wakers won't miss to-be-slept tasks in the window between
+	 * the wait call and the actual plist_add.
+	 */
+	if (spin_is_locked(&hb->lock))
+		return true;
+	smp_rmb(); /* Make sure we check the lock state first */
+
+	return !plist_head_empty(&hb->chain);
+#else
+	return true;
+#endif
+}
+
 /*
  * We hash on the keys returned from get_futex_key (see below).
  */
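
For intuition only, here is a rough user-space analogue (hypothetical names, pthreads in place of hb->lock and the plist) of the optimization hb_waiters_pending() enables: the waker publishes the new value, then checks a waiter indicator, and only pays for the lock when someone might actually be blocked; the waiter bumps the indicator before locking and revalidates the value under the lock, mirroring futex_wait():

#include <pthread.h>
#include <stdatomic.h>

static struct bucket {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	atomic_int waiters;	/* analogue of the hb waiter indication */
	atomic_int value;	/* analogue of the futex word */
} bkt = {
	.lock = PTHREAD_MUTEX_INITIALIZER,
	.cond = PTHREAD_COND_INITIALIZER,
};

static void bucket_wait(struct bucket *b, int expected)
{
	atomic_fetch_add(&b->waiters, 1);		/* waiters++; seq_cst plays MB (A) */

	pthread_mutex_lock(&b->lock);
	while (atomic_load(&b->value) == expected)	/* revalidate, then block */
		pthread_cond_wait(&b->cond, &b->lock);
	pthread_mutex_unlock(&b->lock);

	atomic_fetch_sub(&b->waiters, 1);
}

static void bucket_wake(struct bucket *b, int newval)
{
	atomic_store(&b->value, newval);	/* *futex = newval; seq_cst plays MB (B) */

	if (atomic_load(&b->waiters) == 0)	/* fast path: nobody can be blocked */
		return;

	pthread_mutex_lock(&b->lock);
	pthread_cond_broadcast(&b->cond);
	pthread_mutex_unlock(&b->lock);
}

The seq_cst increment and store give the same pairing the comment calls (A) and (B): either the waker sees a non-zero waiter count, or the waiter sees the new value before it blocks.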
@@ -245,10 +306,10 @@ static void get_futex_key_refs(union futex_key *key)
 
 	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
 	case FUT_OFF_INODE:
-		ihold(key->shared.inode);
+		ihold(key->shared.inode); /* implies MB (B) */
 		break;
 	case FUT_OFF_MMSHARED:
-		atomic_inc(&key->private.mm->mm_count);
+		futex_get_mm(key); /* implies MB (B) */
 		break;
 	}
 }
@@ -322,7 +383,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
 	if (!fshared) {
 		key->private.mm = mm;
 		key->private.address = address;
-		get_futex_key_refs(key);
+		get_futex_key_refs(key); /* implies MB (B) */
 		return 0;
 	}
 
@@ -429,7 +490,7 @@ again:
 		key->shared.pgoff = basepage_index(page);
 	}
 
-	get_futex_key_refs(key);
+	get_futex_key_refs(key); /* implies MB (B) */
 
 out:
 	unlock_page(page_head);
@@ -1052,6 +1113,11 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 		goto out;
 
 	hb = hash_futex(&key);
+
+	/* Make sure we really have tasks to wakeup */
+	if (!hb_waiters_pending(hb))
+		goto out_put_key;
+
 	spin_lock(&hb->lock);
 
 	plist_for_each_entry_safe(this, next, &hb->chain, list) {
@@ -1072,6 +1138,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 	}
 
 	spin_unlock(&hb->lock);
+out_put_key:
 	put_futex_key(&key);
 out:
 	return ret;
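
From user space, the protocol the comment block describes looks roughly like this (a sketch with hypothetical helper names, not taken from the patch): the waker updates the futex word before calling FUTEX_WAKE, and the waiter passes the value it last saw to FUTEX_WAIT so the kernel can revalidate it under the hash bucket lock:

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdatomic.h>

/* Waker: make the state change visible, then wake at most one sleeper. */
static void futex_wake_one(atomic_int *uaddr, int newval)
{
	atomic_store(uaddr, newval);			/* *futex = newval; */
	syscall(SYS_futex, uaddr, FUTEX_WAKE, 1,	/* sys_futex(WAKE, futex); */
		NULL, NULL, 0);
}

/* Waiter: sleep only while the word still holds the value we last read. */
static void futex_wait_on(atomic_int *uaddr, int val)
{
	while (atomic_load(uaddr) == val)		/* val = *futex; */
		syscall(SYS_futex, uaddr, FUTEX_WAIT, val,
			NULL, NULL, 0);			/* sys_futex(WAIT, futex, val); */
}

A waiter that earlier read val from the word calls futex_wait_on(&word, val); FUTEX_WAIT returns immediately with EAGAIN if the word has already changed, which is exactly the in-kernel revalidation described above.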
@@ -1535,7 +1602,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
 	hb = hash_futex(&q->key);
 	q->lock_ptr = &hb->lock;
 
-	spin_lock(&hb->lock);
+	spin_lock(&hb->lock); /* implies MB (A) */
 	return hb;
 }
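
The reason spin_lock() can double as barrier (A) is that acquiring a ticket-style spinlock is itself an atomic counter increment that a concurrent waker can observe. A much-simplified, hypothetical sketch of that idea in C11 atomics (not the kernel's arch spinlock implementation):

#include <stdatomic.h>

struct ticket_lock {
	atomic_uint next;	/* bumped by every acquirer */
	atomic_uint owner;	/* bumped on release */
};

static void ticket_lock_acquire(struct ticket_lock *l)
{
	/* seq_cst RMW: both the visible increment and a full barrier, cf. MB (A) */
	unsigned int me = atomic_fetch_add(&l->next, 1);

	while (atomic_load(&l->owner) != me)
		;	/* spin until it is our turn */
}

static void ticket_lock_release(struct ticket_lock *l)
{
	atomic_fetch_add(&l->owner, 1);
}

/* What a spin_is_locked()-style check sees: acquirers leave a trace. */
static int ticket_lock_is_locked(struct ticket_lock *l)
{
	return atomic_load(&l->next) != atomic_load(&l->owner);
}

This is also why hb_waiters_pending() can treat a locked hb->lock as "there may be a waiter": the lock acquisition already left a visible mark before the waiter gets anywhere near plist_add().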