@@ -50,10 +50,10 @@
  *
  * 1) epmutex (mutex)
  * 2) ep->mtx (mutex)
- * 3) ep->lock (spinlock)
+ * 3) ep->wq.lock (spinlock)
  *
  * The acquire order is the one listed above, from 1 to 3.
- * We need a spinlock (ep->lock) because we manipulate objects
+ * We need a spinlock (ep->wq.lock) because we manipulate objects
  * from inside the poll callback, that might be triggered from
  * a wake_up() that in turn might be called from IRQ context.
  * So we can't sleep inside the poll callback and hence we need
@@ -85,7 +85,7 @@
  * of epoll file descriptors, we use the current recursion depth as
  * the lockdep subkey.
  * It is possible to drop the "ep->mtx" and to use the global
- * mutex "epmutex" (together with "ep->lock") to have it working,
+ * mutex "epmutex" (together with "ep->wq.lock") to have it working,
  * but having "ep->mtx" will make the interface more scalable.
  * Events that require holding "epmutex" are very rare, while for
  * normal operations the epoll private "ep->mtx" will guarantee
@@ -182,11 +182,10 @@ struct epitem {
  * This structure is stored inside the "private_data" member of the file
  * structure and represents the main data structure for the eventpoll
  * interface.
+ *
+ * Access to it is protected by the lock inside wq.
  */
 struct eventpoll {
- /* Protect the access to this structure */
- spinlock_t lock;
-
  /*
  * This mutex is used to ensure that files are not removed
  * while epoll is using them. This is held during the event
@@ -210,7 +209,7 @@ struct eventpoll {
  /*
  * This is a single linked list that chains all the "struct epitem" that
  * happened while transferring ready events to userspace w/out
- * holding ->lock.
+ * holding ->wq.lock.
  */
  struct epitem *ovflist;

@@ -688,17 +687,17 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
  * because we want the "sproc" callback to be able to do it
  * in a lockless way.
  */
- spin_lock_irqsave(&ep->lock, flags);
+ spin_lock_irqsave(&ep->wq.lock, flags);
  list_splice_init(&ep->rdllist, &txlist);
  ep->ovflist = NULL;
- spin_unlock_irqrestore(&ep->lock, flags);
+ spin_unlock_irqrestore(&ep->wq.lock, flags);

  /*
  * Now call the callback function.
  */
  res = (*sproc)(ep, &txlist, priv);

- spin_lock_irqsave(&ep->lock, flags);
+ spin_lock_irqsave(&ep->wq.lock, flags);
  /*
  * During the time we spent inside the "sproc" callback, some
  * other events might have been queued by the poll callback.
@@ -740,7 +739,7 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
  if (waitqueue_active(&ep->poll_wait))
  pwake++;
  }
- spin_unlock_irqrestore(&ep->lock, flags);
+ spin_unlock_irqrestore(&ep->wq.lock, flags);

  if (!ep_locked)
  mutex_unlock(&ep->mtx);
@@ -768,12 +767,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
  struct file *file = epi->ffd.file;

  /*
- * Removes poll wait queue hooks. We _have_ to do this without holding
- * the "ep->lock" otherwise a deadlock might occur. This because of the
- * sequence of the lock acquisition. Here we do "ep->lock" then the wait
- * queue head lock when unregistering the wait queue. The wakeup callback
- * will run by holding the wait queue head lock and will call our callback
- * that will try to get "ep->lock".
+ * Removes poll wait queue hooks.
  */
  ep_unregister_pollwait(ep, epi);

@@ -784,10 +778,10 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)

  rb_erase_cached(&epi->rbn, &ep->rbr);

- spin_lock_irqsave(&ep->lock, flags);
+ spin_lock_irqsave(&ep->wq.lock, flags);
  if (ep_is_linked(&epi->rdllink))
  list_del_init(&epi->rdllink);
- spin_unlock_irqrestore(&ep->lock, flags);
+ spin_unlock_irqrestore(&ep->wq.lock, flags);

  wakeup_source_unregister(ep_wakeup_source(epi));
  /*
@@ -837,7 +831,7 @@ static void ep_free(struct eventpoll *ep)
  * Walks through the whole tree by freeing each "struct epitem". At this
  * point we are sure no poll callbacks will be lingering around, and also by
  * holding "epmutex" we can be sure that no file cleanup code will hit
- * us during this operation. So we can avoid the lock on "ep->lock".
+ * us during this operation. So we can avoid the lock on "ep->wq.lock".
  * We do not need to lock ep->mtx, either, we only do it to prevent
  * a lockdep warning.
  */
@@ -1017,7 +1011,6 @@ static int ep_alloc(struct eventpoll **pep)
  if (unlikely(!ep))
  goto free_uid;

- spin_lock_init(&ep->lock);
  mutex_init(&ep->mtx);
  init_waitqueue_head(&ep->wq);
  init_waitqueue_head(&ep->poll_wait);
@@ -1122,7 +1115,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
  __poll_t pollflags = key_to_poll(key);
  int ewake = 0;

- spin_lock_irqsave(&ep->lock, flags);
+ spin_lock_irqsave(&ep->wq.lock, flags);

  ep_set_busy_poll_napi_id(epi);

@@ -1199,7 +1192,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
  pwake++;

 out_unlock:
- spin_unlock_irqrestore(&ep->lock, flags);
+ spin_unlock_irqrestore(&ep->wq.lock, flags);

  /* We have to call this outside the lock */
  if (pwake)
@@ -1484,7 +1477,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
  goto error_remove_epi;

  /* We have to drop the new item inside our item list to keep track of it */
- spin_lock_irqsave(&ep->lock, flags);
+ spin_lock_irqsave(&ep->wq.lock, flags);

  /* record NAPI ID of new item if present */
  ep_set_busy_poll_napi_id(epi);
@@ -1501,7 +1494,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
  pwake++;
  }

- spin_unlock_irqrestore(&ep->lock, flags);
+ spin_unlock_irqrestore(&ep->wq.lock, flags);

  atomic_long_inc(&ep->user->epoll_watches);

@@ -1527,10 +1520,10 @@ error_unregister:
  * list, since that is used/cleaned only inside a section bound by "mtx".
  * And ep_insert() is called with "mtx" held.
  */
- spin_lock_irqsave(&ep->lock, flags);
+ spin_lock_irqsave(&ep->wq.lock, flags);
  if (ep_is_linked(&epi->rdllink))
  list_del_init(&epi->rdllink);
- spin_unlock_irqrestore(&ep->lock, flags);
+ spin_unlock_irqrestore(&ep->wq.lock, flags);

  wakeup_source_unregister(ep_wakeup_source(epi));

@@ -1572,9 +1565,9 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
  * 1) Flush epi changes above to other CPUs. This ensures
  * we do not miss events from ep_poll_callback if an
  * event occurs immediately after we call f_op->poll().
- * We need this because we did not take ep->lock while
+ * We need this because we did not take ep->wq.lock while
  * changing epi above (but ep_poll_callback does take
- * ep->lock).
+ * ep->wq.lock).
  *
  * 2) We also need to ensure we do not miss _past_ events
  * when calling f_op->poll(). This barrier also
@@ -1593,7 +1586,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
  * list, push it inside.
  */
  if (ep_item_poll(epi, &pt, 1)) {
- spin_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->wq.lock);
  if (!ep_is_linked(&epi->rdllink)) {
  list_add_tail(&epi->rdllink, &ep->rdllist);
  ep_pm_stay_awake(epi);
@@ -1604,7 +1597,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
  if (waitqueue_active(&ep->poll_wait))
  pwake++;
  }
- spin_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->wq.lock);
  }

  /* We have to call this outside the lock */
@@ -1756,7 +1749,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
  * caller specified a non blocking operation.
  */
  timed_out = 1;
- spin_lock_irqsave(&ep->lock, flags);
+ spin_lock_irqsave(&ep->wq.lock, flags);
  goto check_events;
  }

@@ -1765,7 +1758,7 @@ fetch_events:
  if (!ep_events_available(ep))
  ep_busy_loop(ep, timed_out);

- spin_lock_irqsave(&ep->lock, flags);
+ spin_lock_irqsave(&ep->wq.lock, flags);

  if (!ep_events_available(ep)) {
  /*
@@ -1807,11 +1800,11 @@ fetch_events:
  break;
  }

- spin_unlock_irqrestore(&ep->lock, flags);
+ spin_unlock_irqrestore(&ep->wq.lock, flags);
  if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
  timed_out = 1;

- spin_lock_irqsave(&ep->lock, flags);
+ spin_lock_irqsave(&ep->wq.lock, flags);
  }

  __remove_wait_queue(&ep->wq, &wait);
@@ -1821,7 +1814,7 @@ check_events:
  /* Is it worth to try to dig for events ? */
  eavail = ep_events_available(ep);

- spin_unlock_irqrestore(&ep->lock, flags);
+ spin_unlock_irqrestore(&ep->wq.lock, flags);

  /*
  * Try to transfer events to user space. In case we get 0 events and
|