@@ -802,7 +802,7 @@ static int refill_pi_state_cache(void)
return 0;
}

-static struct futex_pi_state * alloc_pi_state(void)
+static struct futex_pi_state *alloc_pi_state(void)
{
struct futex_pi_state *pi_state = current->pi_state_cache;

@@ -812,6 +812,11 @@ static struct futex_pi_state * alloc_pi_state(void)
return pi_state;
}

+static void get_pi_state(struct futex_pi_state *pi_state)
+{
+ WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount));
+}
+
/*
* Drops a reference to the pi_state object and frees or caches it
* when the last reference is gone.
@@ -856,7 +861,7 @@ static void put_pi_state(struct futex_pi_state *pi_state)
* Look up the task based on what TID userspace gave us.
* We dont trust it.
*/
-static struct task_struct * futex_find_get_task(pid_t pid)
+static struct task_struct *futex_find_get_task(pid_t pid)
{
struct task_struct *p;

@@ -916,10 +921,12 @@ void exit_pi_state_list(struct task_struct *curr)
pi_state->owner = NULL;
raw_spin_unlock_irq(&curr->pi_lock);

- rt_mutex_unlock(&pi_state->pi_mutex);
-
+ get_pi_state(pi_state);
spin_unlock(&hb->lock);

+ rt_mutex_futex_unlock(&pi_state->pi_mutex);
+ put_pi_state(pi_state);
+
raw_spin_lock_irq(&curr->pi_lock);
}
raw_spin_unlock_irq(&curr->pi_lock);
@@ -973,6 +980,39 @@ void exit_pi_state_list(struct task_struct *curr)
*
* [10] There is no transient state which leaves owner and user space
* TID out of sync.
+ *
+ *
+ * Serialization and lifetime rules:
+ *
+ * hb->lock:
+ *
+ * hb -> futex_q, relation
+ * futex_q -> pi_state, relation
+ *
+ * (cannot be raw because hb can contain arbitrary amount
+ * of futex_q's)
+ *
+ * pi_mutex->wait_lock:
+ *
+ * {uval, pi_state}
+ *
+ * (and pi_mutex 'obviously')
+ *
+ * p->pi_lock:
+ *
+ * p->pi_state_list -> pi_state->list, relation
+ *
+ * pi_state->refcount:
+ *
+ * pi_state lifetime
+ *
+ *
+ * Lock order:
+ *
+ * hb->lock
+ * pi_mutex->wait_lock
+ * p->pi_lock
+ *
*/

/*
@@ -980,10 +1020,13 @@ void exit_pi_state_list(struct task_struct *curr)
* the pi_state against the user space value. If correct, attach to
* it.
*/
-static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
+static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
+ struct futex_pi_state *pi_state,
struct futex_pi_state **ps)
{
pid_t pid = uval & FUTEX_TID_MASK;
+ u32 uval2;
+ int ret;

/*
* Userspace might have messed up non-PI and PI futexes [3]
@@ -991,8 +1034,38 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
if (unlikely(!pi_state))
return -EINVAL;

+ /*
+ * We get here with hb->lock held, and having found a
+ * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
+ * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
+ * which in turn means that futex_lock_pi() still has a reference on
+ * our pi_state.
+ *
+ * The waiter holding a reference on @pi_state also protects against
+ * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
+ * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
+ * free pi_state before we can take a reference ourselves.
+ */
WARN_ON(!atomic_read(&pi_state->refcount));

+ /*
+ * Now that we have a pi_state, we can acquire wait_lock
+ * and do the state validation.
+ */
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+ /*
+ * Since {uval, pi_state} is serialized by wait_lock, and our current
+ * uval was read without holding it, it can have changed. Verify it
+ * still is what we expect it to be, otherwise retry the entire
+ * operation.
+ */
+ if (get_futex_value_locked(&uval2, uaddr))
+ goto out_efault;
+
+ if (uval != uval2)
+ goto out_eagain;
+
/*
* Handle the owner died case:
*/
@@ -1008,11 +1081,11 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
* is not 0. Inconsistent state. [5]
*/
if (pid)
- return -EINVAL;
+ goto out_einval;
/*
* Take a ref on the state and return success. [4]
*/
- goto out_state;
+ goto out_attach;
}

/*
@@ -1024,14 +1097,14 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
* Take a ref on the state and return success. [6]
*/
if (!pid)
- goto out_state;
+ goto out_attach;
} else {
/*
* If the owner died bit is not set, then the pi_state
* must have an owner. [7]
*/
if (!pi_state->owner)
- return -EINVAL;
+ goto out_einval;
}

/*
@@ -1040,11 +1113,29 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
* user space TID. [9/10]
*/
if (pid != task_pid_vnr(pi_state->owner))
- return -EINVAL;
-out_state:
- atomic_inc(&pi_state->refcount);
+ goto out_einval;
+
+out_attach:
+ get_pi_state(pi_state);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
*ps = pi_state;
return 0;
+
+out_einval:
+ ret = -EINVAL;
+ goto out_error;
+
+out_eagain:
+ ret = -EAGAIN;
+ goto out_error;
+
+out_efault:
+ ret = -EFAULT;
+ goto out_error;
+
+out_error:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ return ret;
}

/*
@@ -1095,6 +1186,9 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,

/*
* No existing pi state. First waiter. [2]
+ *
+ * This creates pi_state, we have hb->lock held, this means nothing can
+ * observe this state, wait_lock is irrelevant.
*/
pi_state = alloc_pi_state();

@@ -1119,17 +1213,18 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
return 0;
}

-static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
+static int lookup_pi_state(u32 __user *uaddr, u32 uval,
+ struct futex_hash_bucket *hb,
union futex_key *key, struct futex_pi_state **ps)
{
- struct futex_q *match = futex_top_waiter(hb, key);
+ struct futex_q *top_waiter = futex_top_waiter(hb, key);

/*
* If there is a waiter on that futex, validate it and
* attach to the pi_state when the validation succeeds.
*/
- if (match)
- return attach_to_pi_state(uval, match->pi_state, ps);
+ if (top_waiter)
+ return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);

/*
* We are the first waiter - try to look up the owner based on
@@ -1148,7 +1243,7 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
return -EFAULT;

- /*If user space value changed, let the caller retry */
+ /* If user space value changed, let the caller retry */
return curval != uval ? -EAGAIN : 0;
}

@@ -1176,7 +1271,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
struct task_struct *task, int set_waiters)
{
u32 uval, newval, vpid = task_pid_vnr(task);
- struct futex_q *match;
+ struct futex_q *top_waiter;
int ret;

/*
@@ -1202,9 +1297,9 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
* Lookup existing state first. If it exists, try to attach to
* its pi_state.
*/
- match = futex_top_waiter(hb, key);
- if (match)
- return attach_to_pi_state(uval, match->pi_state, ps);
+ top_waiter = futex_top_waiter(hb, key);
+ if (top_waiter)
+ return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);

/*
* No waiter and user TID is 0. We are here because the
@@ -1285,50 +1380,44 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
wake_q_add(wake_q, p);
__unqueue_futex(q);
/*
- * The waiting task can free the futex_q as soon as
- * q->lock_ptr = NULL is written, without taking any locks. A
- * memory barrier is required here to prevent the following
- * store to lock_ptr from getting ahead of the plist_del.
+ * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
+ * is written, without taking any locks. This is possible in the event
+ * of a spurious wakeup, for example. A memory barrier is required here
+ * to prevent the following store to lock_ptr from getting ahead of the
+ * plist_del in __unqueue_futex().
*/
- smp_wmb();
- q->lock_ptr = NULL;
+ smp_store_release(&q->lock_ptr, NULL);
}

-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
- struct futex_hash_bucket *hb)
+/*
+ * Caller must hold a reference on @pi_state.
+ */
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
{
- struct task_struct *new_owner;
- struct futex_pi_state *pi_state = this->pi_state;
u32 uninitialized_var(curval), newval;
+ struct task_struct *new_owner;
+ bool postunlock = false;
DEFINE_WAKE_Q(wake_q);
- bool deboost;
int ret = 0;

- if (!pi_state)
- return -EINVAL;
-
- /*
- * If current does not own the pi_state then the futex is
- * inconsistent and user space fiddled with the futex value.
- */
- if (pi_state->owner != current)
- return -EINVAL;
-
- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
+ if (WARN_ON_ONCE(!new_owner)) {
+ /*
+ * As per the comment in futex_unlock_pi() this should not happen.
+ *
+ * When this happens, give up our locks and try again, giving
+ * the futex_lock_pi() instance time to complete, either by
+ * waiting on the rtmutex or removing itself from the futex
+ * queue.
+ */
+ ret = -EAGAIN;
+ goto out_unlock;
+ }

/*
- * It is possible that the next waiter (the one that brought
- * this owner to the kernel) timed out and is no longer
- * waiting on the lock.
- */
- if (!new_owner)
- new_owner = this->task;
-
- /*
- * We pass it to the next owner. The WAITERS bit is always
- * kept enabled while there is PI state around. We cleanup the
- * owner died bit, because we are the owner.
+ * We pass it to the next owner. The WAITERS bit is always kept
+ * enabled while there is PI state around. We cleanup the owner
+ * died bit, because we are the owner.
*/
newval = FUTEX_WAITERS | task_pid_vnr(new_owner);

@@ -1337,6 +1426,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,

if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
ret = -EFAULT;
+
} else if (curval != uval) {
/*
* If a unconditional UNLOCK_PI operation (user space did not
@@ -1349,10 +1439,14 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
else
ret = -EINVAL;
}
- if (ret) {
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- return ret;
- }
+
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * This is a point of no return; once we modify the uval there is no
+ * going back and subsequent operations must not fail.
+ */

raw_spin_lock(&pi_state->owner->pi_lock);
WARN_ON(list_empty(&pi_state->list));
@@ -1365,22 +1459,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
pi_state->owner = new_owner;
raw_spin_unlock(&new_owner->pi_lock);

- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);

- deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
+out_unlock:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);

- /*
- * First unlock HB so the waiter does not spin on it once he got woken
- * up. Second wake up the waiter before the priority is adjusted. If we
- * deboost first (and lose our higher priority), then the task might get
- * scheduled away before the wake up can take place.
- */
- spin_unlock(&hb->lock);
- wake_up_q(&wake_q);
- if (deboost)
- rt_mutex_adjust_prio(current);
+ if (postunlock)
+ rt_mutex_postunlock(&wake_q);

- return 0;
+ return ret;
}

/*
@@ -1826,7 +1913,7 @@ retry_private:
* If that call succeeds then we have pi_state and an
* initial refcount on it.
*/
- ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
+ ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state);
}

switch (ret) {
@@ -1909,7 +1996,7 @@ retry_private:
* refcount on the pi_state and store the pointer in
* the futex_q object of the waiter.
*/
- atomic_inc(&pi_state->refcount);
+ get_pi_state(pi_state);
this->pi_state = pi_state;
ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
this->rt_waiter,
@@ -2009,20 +2096,7 @@ queue_unlock(struct futex_hash_bucket *hb)
hb_waiters_dec(hb);
}

-/**
- * queue_me() - Enqueue the futex_q on the futex_hash_bucket
- * @q: The futex_q to enqueue
- * @hb: The destination hash bucket
- *
- * The hb->lock must be held by the caller, and is released here. A call to
- * queue_me() is typically paired with exactly one call to unqueue_me(). The
- * exceptions involve the PI related operations, which may use unqueue_me_pi()
- * or nothing if the unqueue is done as part of the wake process and the unqueue
- * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
- * an example).
- */
-static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
- __releases(&hb->lock)
+static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
{
int prio;

@@ -2039,6 +2113,24 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
plist_node_init(&q->list, prio);
plist_add(&q->list, &hb->chain);
q->task = current;
+}
+
+/**
+ * queue_me() - Enqueue the futex_q on the futex_hash_bucket
+ * @q: The futex_q to enqueue
+ * @hb: The destination hash bucket
+ *
+ * The hb->lock must be held by the caller, and is released here. A call to
+ * queue_me() is typically paired with exactly one call to unqueue_me(). The
+ * exceptions involve the PI related operations, which may use unqueue_me_pi()
+ * or nothing if the unqueue is done as part of the wake process and the unqueue
+ * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
+ * an example).
+ */
+static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+ __releases(&hb->lock)
+{
+ __queue_me(q, hb);
spin_unlock(&hb->lock);
}

@@ -2125,10 +2217,13 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
{
u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
struct futex_pi_state *pi_state = q->pi_state;
- struct task_struct *oldowner = pi_state->owner;
u32 uval, uninitialized_var(curval), newval;
+ struct task_struct *oldowner;
int ret;

+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+ oldowner = pi_state->owner;
/* Owner died? */
if (!pi_state->owner)
newtid |= FUTEX_OWNER_DIED;
@@ -2136,7 +2231,8 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
/*
* We are here either because we stole the rtmutex from the
* previous highest priority waiter or we are the highest priority
- * waiter but failed to get the rtmutex the first time.
+ * waiter but have failed to get the rtmutex the first time.
+ *
* We have to replace the newowner TID in the user space variable.
* This must be atomic as we have to preserve the owner died bit here.
*
@@ -2144,17 +2240,16 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
* because we can fault here. Imagine swapped out pages or a fork
* that marked all the anonymous memory readonly for cow.
*
- * Modifying pi_state _before_ the user space value would
- * leave the pi_state in an inconsistent state when we fault
- * here, because we need to drop the hash bucket lock to
- * handle the fault. This might be observed in the PID check
- * in lookup_pi_state.
+ * Modifying pi_state _before_ the user space value would leave the
+ * pi_state in an inconsistent state when we fault here, because we
+ * need to drop the locks to handle the fault. This might be observed
+ * in the PID check in lookup_pi_state.
*/
retry:
if (get_futex_value_locked(&uval, uaddr))
goto handle_fault;

- while (1) {
+ for (;;) {
newval = (uval & FUTEX_OWNER_DIED) | newtid;

if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
@@ -2169,47 +2264,60 @@ retry:
* itself.
*/
if (pi_state->owner != NULL) {
- raw_spin_lock_irq(&pi_state->owner->pi_lock);
+ raw_spin_lock(&pi_state->owner->pi_lock);
WARN_ON(list_empty(&pi_state->list));
list_del_init(&pi_state->list);
- raw_spin_unlock_irq(&pi_state->owner->pi_lock);
+ raw_spin_unlock(&pi_state->owner->pi_lock);
}

pi_state->owner = newowner;

- raw_spin_lock_irq(&newowner->pi_lock);
+ raw_spin_lock(&newowner->pi_lock);
WARN_ON(!list_empty(&pi_state->list));
list_add(&pi_state->list, &newowner->pi_state_list);
- raw_spin_unlock_irq(&newowner->pi_lock);
+ raw_spin_unlock(&newowner->pi_lock);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+
return 0;

/*
- * To handle the page fault we need to drop the hash bucket
- * lock here. That gives the other task (either the highest priority
- * waiter itself or the task which stole the rtmutex) the
- * chance to try the fixup of the pi_state. So once we are
- * back from handling the fault we need to check the pi_state
- * after reacquiring the hash bucket lock and before trying to
- * do another fixup. When the fixup has been done already we
- * simply return.
+ * To handle the page fault we need to drop the locks here. That gives
+ * the other task (either the highest priority waiter itself or the
+ * task which stole the rtmutex) the chance to try the fixup of the
+ * pi_state. So once we are back from handling the fault we need to
+ * check the pi_state after reacquiring the locks and before trying to
+ * do another fixup. When the fixup has been done already we simply
+ * return.
+ *
+ * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
+ * drop hb->lock since the caller owns the hb -> futex_q relation.
+ * Dropping the pi_mutex->wait_lock requires the state revalidate.
*/
handle_fault:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(q->lock_ptr);

ret = fault_in_user_writeable(uaddr);

spin_lock(q->lock_ptr);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

/*
* Check if someone else fixed it for us:
*/
- if (pi_state->owner != oldowner)
- return 0;
+ if (pi_state->owner != oldowner) {
+ ret = 0;
+ goto out_unlock;
+ }

if (ret)
- return ret;
+ goto out_unlock;

goto retry;
+
+out_unlock:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ return ret;
}

static long futex_wait_restart(struct restart_block *restart);
@@ -2231,57 +2339,32 @@ static long futex_wait_restart(struct restart_block *restart);
*/
static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
{
- struct task_struct *owner;
int ret = 0;

if (locked) {
/*
* Got the lock. We might not be the anticipated owner if we
* did a lock-steal - fix up the PI-state in that case:
+ *
+ * We can safely read pi_state->owner without holding wait_lock
+ * because we now own the rt_mutex, only the owner will attempt
+ * to change it.
*/
if (q->pi_state->owner != current)
ret = fixup_pi_state_owner(uaddr, q, current);
goto out;
}

- /*
- * Catch the rare case, where the lock was released when we were on the
- * way back before we locked the hash bucket.
- */
- if (q->pi_state->owner == current) {
- /*
- * Try to get the rt_mutex now. This might fail as some other
- * task acquired the rt_mutex after we removed ourself from the
- * rt_mutex waiters list.
- */
- if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
- locked = 1;
- goto out;
- }
-
- /*
- * pi_state is incorrect, some other task did a lock steal and
- * we returned due to timeout or signal without taking the
- * rt_mutex. Too late.
- */
- raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock);
- owner = rt_mutex_owner(&q->pi_state->pi_mutex);
- if (!owner)
- owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
- raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock);
- ret = fixup_pi_state_owner(uaddr, q, owner);
- goto out;
- }
-
/*
* Paranoia check. If we did not take the lock, then we should not be
* the owner of the rt_mutex.
*/
- if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
+ if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) {
printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
"pi-state %p\n", ret,
q->pi_state->pi_mutex.owner,
q->pi_state->owner);
+ }

out:
return ret ? ret : locked;
@@ -2505,6 +2588,8 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
ktime_t *time, int trylock)
{
struct hrtimer_sleeper timeout, *to = NULL;
+ struct futex_pi_state *pi_state = NULL;
+ struct rt_mutex_waiter rt_waiter;
struct futex_hash_bucket *hb;
struct futex_q q = futex_q_init;
int res, ret;
@@ -2557,24 +2642,67 @@ retry_private:
}
}

+ WARN_ON(!q.pi_state);
+
/*
* Only actually queue now that the atomic ops are done:
*/
- queue_me(&q, hb);
+ __queue_me(&q, hb);

- WARN_ON(!q.pi_state);
- /*
- * Block on the PI mutex:
- */
- if (!trylock) {
- ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
- } else {
- ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
+ if (trylock) {
+ ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
/* Fixup the trylock return value: */
ret = ret ? 0 : -EWOULDBLOCK;
+ goto no_block;
}

+ rt_mutex_init_waiter(&rt_waiter);
+
+ /*
+ * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
+ * hold it while doing rt_mutex_start_proxy(), because then it will
+ * include hb->lock in the blocking chain, even through we'll not in
+ * fact hold it while blocking. This will lead it to report -EDEADLK
+ * and BUG when futex_unlock_pi() interleaves with this.
+ *
+ * Therefore acquire wait_lock while holding hb->lock, but drop the
+ * latter before calling rt_mutex_start_proxy_lock(). This still fully
+ * serializes against futex_unlock_pi() as that does the exact same
+ * lock handoff sequence.
+ */
+ raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
+ spin_unlock(q.lock_ptr);
+ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
+ raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
+
+ if (ret) {
+ if (ret == 1)
+ ret = 0;
+
+ spin_lock(q.lock_ptr);
+ goto no_block;
+ }
+
+
+ if (unlikely(to))
+ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
+
+ ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
+
spin_lock(q.lock_ptr);
+ /*
+ * If we failed to acquire the lock (signal/timeout), we must
+ * first acquire the hb->lock before removing the lock from the
+ * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
+ * wait lists consistent.
+ *
+ * In particular; it is important that futex_unlock_pi() can not
+ * observe this inconsistency.
+ */
+ if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
+ ret = 0;
+
+no_block:
/*
* Fixup the pi_state owner and possibly acquire the lock if we
* haven't already.
@@ -2591,12 +2719,19 @@ retry_private:
* If fixup_owner() faulted and was unable to handle the fault, unlock
* it and return the fault to userspace.
*/
- if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
- rt_mutex_unlock(&q.pi_state->pi_mutex);
+ if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) {
+ pi_state = q.pi_state;
+ get_pi_state(pi_state);
+ }

/* Unqueue and drop the lock */
unqueue_me_pi(&q);

+ if (pi_state) {
+ rt_mutex_futex_unlock(&pi_state->pi_mutex);
+ put_pi_state(pi_state);
+ }
+
goto out_put_key;

out_unlock_put_key:
@@ -2605,8 +2740,10 @@ out_unlock_put_key:
out_put_key:
put_futex_key(&q.key);
out:
- if (to)
+ if (to) {
+ hrtimer_cancel(&to->timer);
destroy_hrtimer_on_stack(&to->timer);
+ }
return ret != -EINTR ? ret : -ERESTARTNOINTR;

uaddr_faulted:
@@ -2633,7 +2770,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
union futex_key key = FUTEX_KEY_INIT;
struct futex_hash_bucket *hb;
- struct futex_q *match;
+ struct futex_q *top_waiter;
int ret;

retry:
@@ -2657,12 +2794,37 @@ retry:
* all and we at least want to know if user space fiddled
* with the futex value instead of blindly unlocking.
*/
- match = futex_top_waiter(hb, &key);
- if (match) {
- ret = wake_futex_pi(uaddr, uval, match, hb);
+ top_waiter = futex_top_waiter(hb, &key);
+ if (top_waiter) {
+ struct futex_pi_state *pi_state = top_waiter->pi_state;
+
+ ret = -EINVAL;
+ if (!pi_state)
+ goto out_unlock;
+
/*
- * In case of success wake_futex_pi dropped the hash
- * bucket lock.
+ * If current does not own the pi_state then the futex is
+ * inconsistent and user space fiddled with the futex value.
+ */
+ if (pi_state->owner != current)
+ goto out_unlock;
+
+ get_pi_state(pi_state);
+ /*
+ * By taking wait_lock while still holding hb->lock, we ensure
+ * there is no point where we hold neither; and therefore
+ * wake_futex_pi() must observe a state consistent with what we
+ * observed.
+ */
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(&hb->lock);
+
+ ret = wake_futex_pi(uaddr, uval, pi_state);
+
+ put_pi_state(pi_state);
+
+ /*
+ * Success, we're done! No tricky corner cases.
*/
if (!ret)
goto out_putkey;
@@ -2677,7 +2839,6 @@ retry:
* setting the FUTEX_WAITERS bit. Try again.
*/
if (ret == -EAGAIN) {
- spin_unlock(&hb->lock);
put_futex_key(&key);
goto retry;
}
@@ -2685,7 +2846,7 @@ retry:
* wake_futex_pi has detected invalid state. Tell user
* space.
*/
- goto out_unlock;
+ goto out_putkey;
}

/*
@@ -2695,8 +2856,10 @@ retry:
* preserve the WAITERS bit not the OWNER_DIED one. We are the
* owner.
*/
- if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) {
+ spin_unlock(&hb->lock);
goto pi_faulted;
+ }

/*
* If uval has changed, let user space handle it.
@@ -2710,7 +2873,6 @@ out_putkey:
return ret;

pi_faulted:
- spin_unlock(&hb->lock);
put_futex_key(&key);

ret = fault_in_user_writeable(uaddr);
@@ -2814,6 +2976,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
u32 __user *uaddr2)
{
struct hrtimer_sleeper timeout, *to = NULL;
+ struct futex_pi_state *pi_state = NULL;
struct rt_mutex_waiter rt_waiter;
struct futex_hash_bucket *hb;
union futex_key key2 = FUTEX_KEY_INIT;
@@ -2840,10 +3003,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* The waiter is allocated on our stack, manipulated by the requeue
* code while we sleep on uaddr.
*/
- debug_rt_mutex_init_waiter(&rt_waiter);
- RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
- RB_CLEAR_NODE(&rt_waiter.tree_entry);
- rt_waiter.task = NULL;
+ rt_mutex_init_waiter(&rt_waiter);

ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
if (unlikely(ret != 0))
@@ -2898,8 +3058,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
if (q.pi_state && (q.pi_state->owner != current)) {
spin_lock(q.lock_ptr);
ret = fixup_pi_state_owner(uaddr2, &q, current);
- if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current)
- rt_mutex_unlock(&q.pi_state->pi_mutex);
+ if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
+ pi_state = q.pi_state;
+ get_pi_state(pi_state);
+ }
/*
* Drop the reference to the pi state which
* the requeue_pi() code acquired for us.
@@ -2917,10 +3079,13 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
*/
WARN_ON(!q.pi_state);
pi_mutex = &q.pi_state->pi_mutex;
- ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
- debug_rt_mutex_free_waiter(&rt_waiter);
+ ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);

spin_lock(q.lock_ptr);
+ if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
+ ret = 0;
+
+ debug_rt_mutex_free_waiter(&rt_waiter);
/*
* Fixup the pi_state owner and possibly acquire the lock if we
* haven't already.
@@ -2938,13 +3103,20 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* the fault, unlock the rt_mutex and return the fault to
* userspace.
*/
- if (ret && rt_mutex_owner(pi_mutex) == current)
- rt_mutex_unlock(pi_mutex);
+ if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
+ pi_state = q.pi_state;
+ get_pi_state(pi_state);
+ }

/* Unqueue and drop the lock. */
unqueue_me_pi(&q);
}

+ if (pi_state) {
+ rt_mutex_futex_unlock(&pi_state->pi_mutex);
+ put_pi_state(pi_state);
+ }
+
if (ret == -EINTR) {
/*
* We've already been requeued, but cannot restart by calling