|
@@ -40,6 +40,89 @@ struct pv_node {
|
|
|
u8 state;
|
|
|
};
|
|
|
|
|
|
+/*
|
|
|
+ * By replacing the regular queued_spin_trylock() with the function below,
|
|
|
+ * it will be called once when a lock waiter enters the PV slowpath before
|
|
|
+ * being queued. By allowing one lock stealing attempt here when the pending
|
|
|
+ * bit is off, it helps to reduce the performance impact of lock waiter
|
|
|
+ * preemption without the drawback of lock starvation.
|
|
|
+ */
|
|
|
+#define queued_spin_trylock(l) pv_queued_spin_steal_lock(l)
|
|
|
+static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
|
|
|
+{
|
|
|
+ struct __qspinlock *l = (void *)lock;
|
|
|
+
|
|
|
+ return !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
|
|
|
+ (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0);
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ * The pending bit is used by the queue head vCPU to indicate that it
|
|
|
+ * is actively spinning on the lock and no lock stealing is allowed.
|
|
|
+ */
|
|
|
+#if _Q_PENDING_BITS == 8
|
|
|
+static __always_inline void set_pending(struct qspinlock *lock)
|
|
|
+{
|
|
|
+ struct __qspinlock *l = (void *)lock;
|
|
|
+
|
|
|
+ WRITE_ONCE(l->pending, 1);
|
|
|
+}
|
|
|
+
|
|
|
+static __always_inline void clear_pending(struct qspinlock *lock)
|
|
|
+{
|
|
|
+ struct __qspinlock *l = (void *)lock;
|
|
|
+
|
|
|
+ WRITE_ONCE(l->pending, 0);
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ * The pending bit check in pv_queued_spin_steal_lock() isn't a memory
|
|
|
+ * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock
|
|
|
+ * just to be sure that it will get it.
|
|
|
+ */
|
|
|
+static __always_inline int trylock_clear_pending(struct qspinlock *lock)
|
|
|
+{
|
|
|
+ struct __qspinlock *l = (void *)lock;
|
|
|
+
|
|
|
+ return !READ_ONCE(l->locked) &&
|
|
|
+ (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL)
|
|
|
+ == _Q_PENDING_VAL);
|
|
|
+}
|
|
|
+#else /* _Q_PENDING_BITS == 8 */
|
|
|
+static __always_inline void set_pending(struct qspinlock *lock)
|
|
|
+{
|
|
|
+ atomic_set_mask(_Q_PENDING_VAL, &lock->val);
|
|
|
+}
|
|
|
+
|
|
|
+static __always_inline void clear_pending(struct qspinlock *lock)
|
|
|
+{
|
|
|
+ atomic_clear_mask(_Q_PENDING_VAL, &lock->val);
|
|
|
+}
|
|
|
+
|
|
|
+static __always_inline int trylock_clear_pending(struct qspinlock *lock)
|
|
|
+{
|
|
|
+ int val = atomic_read(&lock->val);
|
|
|
+
|
|
|
+ for (;;) {
|
|
|
+ int old, new;
|
|
|
+
|
|
|
+ if (val & _Q_LOCKED_MASK)
|
|
|
+ break;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Try to clear pending bit & set locked bit
|
|
|
+ */
|
|
|
+ old = val;
|
|
|
+ new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
|
|
|
+ val = atomic_cmpxchg(&lock->val, old, new);
|
|
|
+
|
|
|
+ if (val == old)
|
|
|
+ return 1;
|
|
|
+ }
|
|
|
+ return 0;
|
|
|
+}
|
|
|
+#endif /* _Q_PENDING_BITS == 8 */
|
|
|
+
|
|
|
/*
|
|
|
* Include queued spinlock statistics code
|
|
|
*/
|
|
@@ -202,8 +285,8 @@ static void pv_wait_node(struct mcs_spinlock *node)
|
|
|
|
|
|
/*
|
|
|
* If pv_kick_node() changed us to vcpu_hashed, retain that
|
|
|
- * value so that pv_wait_head() knows to not also try to hash
|
|
|
- * this lock.
|
|
|
+ * value so that pv_wait_head_or_lock() knows to not also try
|
|
|
+ * to hash this lock.
|
|
|
*/
|
|
|
cmpxchg(&pn->state, vcpu_halted, vcpu_running);
|
|
|
|
|
@@ -227,8 +310,9 @@ static void pv_wait_node(struct mcs_spinlock *node)
|
|
|
/*
|
|
|
* Called after setting next->locked = 1 when we're the lock owner.
|
|
|
*
|
|
|
- * Instead of waking the waiters stuck in pv_wait_node() advance their state such
|
|
|
- * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle.
|
|
|
+ * Instead of waking the waiters stuck in pv_wait_node() advance their state
|
|
|
+ * such that they're waiting in pv_wait_head_or_lock(), this avoids a
|
|
|
+ * wake/sleep cycle.
|
|
|
*/
|
|
|
static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
|
|
|
{
|
|
@@ -257,10 +341,14 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
|
|
|
}
|
|
|
|
|
|
/*
|
|
|
- * Wait for l->locked to become clear; halt the vcpu after a short spin.
|
|
|
+ * Wait for l->locked to become clear and acquire the lock;
|
|
|
+ * halt the vcpu after a short spin.
|
|
|
* __pv_queued_spin_unlock() will wake us.
|
|
|
+ *
|
|
|
+ * The current value of the lock will be returned for additional processing.
|
|
|
*/
|
|
|
-static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
|
|
|
+static u32
|
|
|
+pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
|
|
|
{
|
|
|
struct pv_node *pn = (struct pv_node *)node;
|
|
|
struct __qspinlock *l = (void *)lock;
|
|
@@ -276,11 +364,18 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
|
|
|
lp = (struct qspinlock **)1;
|
|
|
|
|
|
for (;; waitcnt++) {
|
|
|
+ /*
|
|
|
+ * Set the pending bit in the active lock spinning loop to
|
|
|
+ * disable lock stealing before attempting to acquire the lock.
|
|
|
+ */
|
|
|
+ set_pending(lock);
|
|
|
for (loop = SPIN_THRESHOLD; loop; loop--) {
|
|
|
- if (!READ_ONCE(l->locked))
|
|
|
- return;
|
|
|
+ if (trylock_clear_pending(lock))
|
|
|
+ goto gotlock;
|
|
|
cpu_relax();
|
|
|
}
|
|
|
+ clear_pending(lock);
|
|
|
+
|
|
|
|
|
|
if (!lp) { /* ONCE */
|
|
|
lp = pv_hash(lock, pn);
|
|
@@ -296,36 +391,38 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
|
|
|
*
|
|
|
* Matches the smp_rmb() in __pv_queued_spin_unlock().
|
|
|
*/
|
|
|
- if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
|
|
|
+ if (xchg(&l->locked, _Q_SLOW_VAL) == 0) {
|
|
|
/*
|
|
|
- * The lock is free and _Q_SLOW_VAL has never
|
|
|
- * been set. Therefore we need to unhash before
|
|
|
- * getting the lock.
|
|
|
+ * The lock was free and now we own the lock.
|
|
|
+ * Change the lock value back to _Q_LOCKED_VAL
|
|
|
+ * and unhash the table.
|
|
|
*/
|
|
|
+ WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
|
|
|
WRITE_ONCE(*lp, NULL);
|
|
|
- return;
|
|
|
+ goto gotlock;
|
|
|
}
|
|
|
}
|
|
|
qstat_inc(qstat_pv_wait_head, true);
|
|
|
qstat_inc(qstat_pv_wait_again, waitcnt);
|
|
|
pv_wait(&l->locked, _Q_SLOW_VAL);
|
|
|
|
|
|
- if (!READ_ONCE(l->locked))
|
|
|
- return;
|
|
|
/*
|
|
|
* The unlocker should have freed the lock before kicking the
|
|
|
* CPU. So if the lock is still not free, it is a spurious
|
|
|
- * wakeup and so the vCPU should wait again after spinning for
|
|
|
- * a while.
|
|
|
+ * wakeup or another vCPU has stolen the lock. The current
|
|
|
+ * vCPU should spin again.
|
|
|
*/
|
|
|
- qstat_inc(qstat_pv_spurious_wakeup, true);
|
|
|
+ qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked));
|
|
|
}
|
|
|
|
|
|
/*
|
|
|
- * Lock is unlocked now; the caller will acquire it without waiting.
|
|
|
- * As with pv_wait_node() we rely on the caller to do a load-acquire
|
|
|
- * for us.
|
|
|
+ * The cmpxchg() or xchg() call before coming here provides the
|
|
|
+ * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
|
|
|
+ * here is to indicate to the compiler that the value will always
|
|
|
+ * be nonzero to enable better code optimization.
|
|
|
*/
|
|
|
+gotlock:
|
|
|
+ return (u32)(atomic_read(&lock->val) | _Q_LOCKED_VAL);
|
|
|
}
|
|
|
|
|
|
/*
|
|
@@ -350,7 +447,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
|
|
|
* so we need a barrier to order the read of the node data in
|
|
|
* pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
|
|
|
*
|
|
|
- * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL.
|
|
|
+ * Matches the xchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL.
|
|
|
*/
|
|
|
smp_rmb();
|
|
|
|