@@ -22,9 +22,14 @@
 
 #define _Q_SLOW_VAL	(3U << _Q_LOCKED_OFFSET)
 
+/*
+ * Queue node uses: vcpu_running & vcpu_halted.
+ * Queue head uses: vcpu_running & vcpu_hashed.
+ */
 enum vcpu_state {
 	vcpu_running = 0,
-	vcpu_halted,
+	vcpu_halted,		/* Used only in pv_wait_node */
+	vcpu_hashed,		/* = pv_hash'ed + vcpu_halted */
 };
 
 struct pv_node {
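
Taken together, the two new states form a tiny per-node state machine: a waiter parks itself running -> halted; afterwards either it wakes and moves back to running, or the lock holder promotes it halted -> hashed, and that promotion must win over the wakeup path. Below is a minimal userspace sketch of those transitions, using C11 atomics in place of the kernel's smp_store_mb()/cmpxchg(); the function names are illustrative, not from the patch.

#include <stdatomic.h>
#include <stdio.h>

enum vcpu_state { vcpu_running = 0, vcpu_halted, vcpu_hashed };

/* Models smp_store_mb(): a store with a full barrier. */
static void park(_Atomic int *state)
{
	atomic_store(state, vcpu_halted);
}

/* Lock holder (pv_kick_node): promote halted -> hashed. */
static int promote(_Atomic int *state)
{
	int old = vcpu_halted;
	return atomic_compare_exchange_strong(state, &old, vcpu_hashed);
}

/* Woken waiter (pv_wait_node): halted -> running, but a hashed
 * promotion must survive so pv_wait_head() can see it. */
static void wake(_Atomic int *state)
{
	int old = vcpu_halted;
	atomic_compare_exchange_strong(state, &old, vcpu_running);
}

int main(void)
{
	_Atomic int state = vcpu_running;

	park(&state);		/* running -> halted */
	promote(&state);	/* holder wins: halted -> hashed */
	wake(&state);		/* no-op: state is hashed, not halted */
	printf("final state = %d (2 == vcpu_hashed)\n",
	       (int)atomic_load(&state));
	return 0;
}
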
@@ -153,7 +158,8 @@ static void pv_init_node(struct mcs_spinlock *node)
 
 /*
  * Wait for node->locked to become true, halt the vcpu after a short spin.
- * pv_kick_node() is used to wake the vcpu again.
+ * pv_kick_node() is used to set _Q_SLOW_VAL and fill in the hash table on
+ * its behalf.
  */
 static void pv_wait_node(struct mcs_spinlock *node)
 {
@@ -172,9 +178,9 @@ static void pv_wait_node(struct mcs_spinlock *node)
 		 *
 		 * [S] pn->state = vcpu_halted    [S] next->locked = 1
 		 *     MB                             MB
-		 * [L] pn->locked                [RmW] pn->state = vcpu_running
+		 * [L] pn->locked                [RmW] pn->state = vcpu_hashed
 		 *
-		 * Matches the xchg() from pv_kick_node().
+		 * Matches the cmpxchg() from pv_kick_node().
 		 */
 		smp_store_mb(pn->state, vcpu_halted);
 
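
The diagram above is the classic store-buffering pattern: each side stores to its own variable, executes a full barrier, then loads the other side's variable, so at least one of the two loads must observe the other side's store. A hedged userspace rendering of the two sides (seq_cst atomics stand in for the kernel's MB; the struct and function names are illustrative):

#include <stdatomic.h>
#include <stdbool.h>

enum { vcpu_running, vcpu_halted, vcpu_hashed };

struct node {
	_Atomic int state;	/* this waiter's vcpu_state     */
	_Atomic int locked;	/* set by the MCS lock hand-off */
};

/* Waiter: [S] state = halted; MB; [L] locked. Sleep only if the
 * hand-off has not happened yet. */
static bool waiter_should_sleep(struct node *n)
{
	atomic_store(&n->state, vcpu_halted);
	return !atomic_load(&n->locked);
}

/* Holder: [S] locked = 1; MB; [RmW] state halted -> hashed. */
static bool holder_promoted(struct node *n)
{
	int old = vcpu_halted;

	atomic_store(&n->locked, 1);
	return atomic_compare_exchange_strong(&n->state, &old, vcpu_hashed);
}

The barrier pairing means the two sides cannot both miss each other: if the waiter did park, the holder's cmpxchg() is guaranteed to find vcpu_halted, so the node gets hashed and the eventual unlocker will kick it; if the waiter saw locked == 1 first, the cmpxchg() fails and nothing needs waking. Either way no wakeup is lost.
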
@@ -182,9 +188,10 @@ static void pv_wait_node(struct mcs_spinlock *node)
 			pv_wait(&pn->state, vcpu_halted);
 
 		/*
-		 * Reset the vCPU state to avoid unncessary CPU kicking
+		 * If pv_kick_node() changed us to vcpu_hashed, retain that
+		 * value so that pv_wait_head() knows not to hash this lock again.
 		 */
-		WRITE_ONCE(pn->state, vcpu_running);
+		cmpxchg(&pn->state, vcpu_halted, vcpu_running);
 
 		/*
 		 * If the locked flag is still not set after wakeup, it is a
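
Why the blind WRITE_ONCE() had to become a cmpxchg(): pv_kick_node() may have moved this node to vcpu_hashed while it slept, and overwriting that with vcpu_running would make pv_wait_head() hash the lock a second time. A small illustrative check, with C11 atomics standing in for the kernel macro:

#include <stdatomic.h>
#include <assert.h>

enum { vcpu_running, vcpu_halted, vcpu_hashed };

int main(void)
{
	/* Suppose pv_kick_node() promoted us while we were parked. */
	_Atomic int state = vcpu_hashed;
	int old = vcpu_halted;

	/* Old code: WRITE_ONCE(pn->state, vcpu_running) would clobber
	 * the promotion and pv_wait_head() would hash the lock twice. */

	/* New code: only halted -> running; vcpu_hashed is retained. */
	atomic_compare_exchange_strong(&state, &old, vcpu_running);
	assert(atomic_load(&state) == vcpu_hashed);
	return 0;
}
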
@@ -194,6 +201,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
 		 * MCS lock will be released soon.
 		 */
 	}
+
 	/*
 	 * By now our node->locked should be 1 and our caller will not actually
 	 * spin-wait for it. We do however rely on our caller to do a
@@ -202,24 +210,35 @@ static void pv_wait_node(struct mcs_spinlock *node)
 }
 
 /*
- * Called after setting next->locked = 1, used to wake those stuck in
- * pv_wait_node().
+ * Called after setting next->locked = 1 when we're the lock owner.
+ *
+ * Instead of waking the waiters stuck in pv_wait_node(), advance their state
+ * so that they wait in pv_wait_head(); this avoids a wake/sleep cycle.
  */
-static void pv_kick_node(struct mcs_spinlock *node)
+static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 {
 	struct pv_node *pn = (struct pv_node *)node;
+	struct __qspinlock *l = (void *)lock;
 
 	/*
-	 * Note that because node->locked is already set, this actual
-	 * mcs_spinlock entry could be re-used already.
+	 * If the vCPU is indeed halted, advance its state to match that of
+	 * pv_wait_node(). If OTOH this fails, the vCPU was running and will
+	 * observe its next->locked value and advance itself.
 	 *
-	 * This should be fine however, kicking people for no reason is
-	 * harmless.
+	 * Matches with the smp_store_mb() and cmpxchg() in pv_wait_node().
+	 */
+	if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
+		return;
+
+	/*
+	 * Put the lock into the hash table and set the _Q_SLOW_VAL.
 	 *
-	 * See the comment in pv_wait_node().
+	 * As this is the same vCPU that will check the _Q_SLOW_VAL value and
+	 * the hash table later on at unlock time, no atomic instruction is
+	 * needed.
 	 */
-	if (xchg(&pn->state, vcpu_running) == vcpu_halted)
-		pv_kick(pn->cpu);
+	WRITE_ONCE(l->locked, _Q_SLOW_VAL);
+	(void)pv_hash(lock, pn);
 }
 
 /*
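
A toy model of the rewritten flow may help: the holder attempts the halted -> hashed promotion; if it fails, the waiter is awake and will advance itself, and only on success does the holder write _Q_SLOW_VAL and record the lock. The hash table below is a stand-in for the kernel's pv_hash() machinery, not its real implementation, and all names are illustrative:

#include <stdatomic.h>
#include <stddef.h>

#define _Q_SLOW_VAL	3
#define NBUCKETS	64

enum { vcpu_running, vcpu_halted, vcpu_hashed };

struct pv_node   { _Atomic int state; int cpu; };
struct qspinlock { _Atomic unsigned char locked; };

/* Toy open-addressed lock -> node table. */
static struct { struct qspinlock *lock; struct pv_node *node; } tab[NBUCKETS];

static void toy_pv_hash(struct qspinlock *lock, struct pv_node *node)
{
	size_t i = ((size_t)lock >> 4) % NBUCKETS;

	while (tab[i].lock)		/* linear probe to a free slot */
		i = (i + 1) % NBUCKETS;
	tab[i].lock = lock;
	tab[i].node = node;
}

static void toy_pv_kick_node(struct qspinlock *lock, struct pv_node *pn)
{
	int old = vcpu_halted;

	/* Race with the waiter's own halted -> running cmpxchg():
	 * exactly one of the two RmWs can win. */
	if (!atomic_compare_exchange_strong(&pn->state, &old, vcpu_hashed))
		return;		/* waiter is running; it advances itself */

	/* We won: flag the slow path and hash on the waiter's behalf.
	 * In the kernel, the same vCPU reads both back at unlock time,
	 * which is why plain stores suffice there. */
	atomic_store(&lock->locked, _Q_SLOW_VAL);
	toy_pv_hash(lock, pn);
}
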
@@ -233,6 +252,13 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
 	struct qspinlock **lp = NULL;
 	int loop;
 
+	/*
+	 * If pv_kick_node() already advanced our state, we don't need to
+	 * insert ourselves into the hash table anymore.
+	 */
+	if (READ_ONCE(pn->state) == vcpu_hashed)
+		lp = (struct qspinlock **)1;
+
 	for (;;) {
 		for (loop = SPIN_THRESHOLD; loop; loop--) {
 			if (!READ_ONCE(l->locked))
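
The odd-looking (struct qspinlock **)1 is just a non-NULL sentinel: lp only ever gates the "hash once" branch below and is never dereferenced on this path, so any non-NULL value means "the hash entry already exists". In miniature (a hypothetical demo, not patch code):

#include <stdio.h>

struct qspinlock;

static void head_wait(int already_hashed)
{
	struct qspinlock **lp = NULL;

	if (already_hashed)			/* pn->state == vcpu_hashed */
		lp = (struct qspinlock **)1;	/* sentinel, never dereferenced */

	if (!lp)
		puts("hash the lock ourselves");
	else
		puts("pv_kick_node() already hashed it");
}

int main(void)
{
	head_wait(0);
	head_wait(1);
	return 0;
}
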
@@ -240,9 +266,10 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
 			cpu_relax();
 		}
 
-		WRITE_ONCE(pn->state, vcpu_halted);
 		if (!lp) { /* ONCE */
+			WRITE_ONCE(pn->state, vcpu_hashed);
 			lp = pv_hash(lock, pn);
+
 			/*
 			 * We must hash before setting _Q_SLOW_VAL, such that
 			 * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock()
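
Note the contrast with pv_kick_node(): here the reader of _Q_SLOW_VAL is a different vCPU (the unlocker), so the hash entry has to be globally visible before the flag is. A sketch of that publish/consume pairing under release/acquire semantics; the C11 orderings stand in for the kernel's stronger barriers, and all names are illustrative:

#include <stdatomic.h>
#include <stddef.h>

#define _Q_SLOW_VAL 3

struct pv_node;
struct qspinlock { _Atomic unsigned char locked; };

/* Head waiter: publish the hash entry, then raise the flag. */
static void publish(struct qspinlock *lock,
		    _Atomic(struct pv_node *) *slot, struct pv_node *pn)
{
	atomic_store_explicit(slot, pn, memory_order_relaxed);
	/* Release: the entry is visible before _Q_SLOW_VAL is. */
	atomic_store_explicit(&lock->locked, _Q_SLOW_VAL,
			      memory_order_release);
}

/* Unlocker: observing _Q_SLOW_VAL guarantees the entry is there. */
static struct pv_node *lookup(struct qspinlock *lock,
			      _Atomic(struct pv_node *) *slot)
{
	if (atomic_load_explicit(&lock->locked,
				 memory_order_acquire) != _Q_SLOW_VAL)
		return NULL;
	return atomic_load_explicit(slot, memory_order_relaxed);
}
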
@@ -333,8 +360,11 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
 	/*
 	 * At this point the memory pointed at by lock can be freed/reused,
 	 * however we can still use the pv_node to kick the CPU.
+	 * The other vCPU may not really be halted, but kicking an active
+	 * vCPU is harmless, apart from the small additional latency of
+	 * completing the unlock.
 	 */
-	if (READ_ONCE(node->state) == vcpu_halted)
+	if (READ_ONCE(node->state) == vcpu_hashed)
 		pv_kick(node->cpu);
 }
 /*
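
For completeness, the tail of the unlock path this hunk touches: after the lock word is released (and possibly freed or reused), the unlocker may still read the hashed pv_node and kick its vCPU. A toy version under the same illustrative names as the sketches above:

#include <stdatomic.h>
#include <stdio.h>

enum { vcpu_running, vcpu_halted, vcpu_hashed };

struct pv_node { _Atomic int state; int cpu; };

static void toy_pv_kick(int cpu)
{
	printf("kick vcpu %d\n", cpu);	/* stands in for a hypercall */
}

/* The lock word may already be gone, but the hashed pv_node is
 * still valid; only nodes that reached vcpu_hashed have a hash
 * entry and therefore a kick pending. */
static void toy_unlock_tail(struct pv_node *node)
{
	/* The waiter may have woken on its own and be running again;
	 * kicking a running vCPU merely adds a little latency. */
	if (atomic_load(&node->state) == vcpu_hashed)
		toy_pv_kick(node->cpu);
}

int main(void)
{
	struct pv_node n = { vcpu_hashed, 7 };

	toy_unlock_tail(&n);
	return 0;
}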