fs/epoll: make nesting accounting safe for -rt kernel

author Jason Baron <jbaron@akamai.com>

Tue, 7 Apr 2020 03:11:23 +0000 (20:11 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Tue, 7 Apr 2020 17:43:44 +0000 (10:43 -0700)
author Jason Baron <jbaron@akamai.com>
Tue, 7 Apr 2020 03:11:23 +0000 (20:11 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 7 Apr 2020 17:43:44 +0000 (10:43 -0700)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c

index eee3c92a9ebf85d4d6509018ffa03ed211a3bdc4..8c596641a72b09733920dd12643194e2bd4b3ab1 100644 (file)
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -218,13 +218,18 @@ struct eventpoll {
         struct file *file;
  
         /* used to optimize loop detection check */
-       int visited;
         struct list_head visited_list_link;
+       int visited;
  
  #ifdef CONFIG_NET_RX_BUSY_POLL
         /* used to track busy poll napi_id */
         unsigned int napi_id;
  #endif
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+       /* tracks wakeup nests for lockdep validation */
+       u8 nests;
+#endif
  };
  
  /* Wait structure used by the poll hooks */
@@ -545,30 +550,47 @@ out_unlock:
   */
  #ifdef CONFIG_DEBUG_LOCK_ALLOC
  
-static DEFINE_PER_CPU(int, wakeup_nest);
-
-static void ep_poll_safewake(wait_queue_head_t *wq)
+static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
  {
+       struct eventpoll *ep_src;
         unsigned long flags;
-       int subclass;
+       u8 nests = 0;
  
-       local_irq_save(flags);
-       preempt_disable();
-       subclass = __this_cpu_read(wakeup_nest);
-       spin_lock_nested(&wq->lock, subclass + 1);
-       __this_cpu_inc(wakeup_nest);
-       wake_up_locked_poll(wq, POLLIN);
-       __this_cpu_dec(wakeup_nest);
-       spin_unlock(&wq->lock);
-       local_irq_restore(flags);
-       preempt_enable();
+       /*
+        * To set the subclass or nesting level for spin_lock_irqsave_nested()
+        * it might be natural to create a per-cpu nest count. However, since
+        * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can
+        * schedule() in the -rt kernel, the per-cpu variable is no longer
+        * protected. Thus, we are introducing a per-eventpoll nest field.
+        * If we are not being called from ep_poll_callback(), epi is NULL and
+        * we are at the first level of nesting, 0. Otherwise, we are being
+        * called from ep_poll_callback() and if a previous wakeup source is
+        * not an epoll file itself, we are at depth 1 since the wakeup source
+        * is depth 0. If the wakeup source is a previous epoll file in the
+        * wakeup chain then we use its nests value and record ours as
+        * nests + 1. The previous epoll file nests value is stable since it is
+        * already holding its own poll_wait.lock.
+        */
+       if (epi) {
+               if ((is_file_epoll(epi->ffd.file))) {
+                       ep_src = epi->ffd.file->private_data;
+                       nests = ep_src->nests;
+               } else {
+                       nests = 1;
+               }
+       }
+       spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
+       ep->nests = nests + 1;
+       wake_up_locked_poll(&ep->poll_wait, EPOLLIN);
+       ep->nests = 0;
+       spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
  }
  
  #else
  
-static void ep_poll_safewake(wait_queue_head_t *wq)
+static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
  {
-       wake_up_poll(wq, EPOLLIN);
+       wake_up_poll(&ep->poll_wait, EPOLLIN);
  }
  
  #endif
@@ -789,7 +811,7 @@ static void ep_free(struct eventpoll *ep)
  
         /* We need to release all tasks waiting for these file */
         if (waitqueue_active(&ep->poll_wait))
-               ep_poll_safewake(&ep->poll_wait);
+               ep_poll_safewake(ep, NULL);
  
         /*
          * We need to lock this because we could be hit by
@@ -1258,7 +1280,7 @@ out_unlock:
  
         /* We have to call this outside the lock */
         if (pwake)
-               ep_poll_safewake(&ep->poll_wait);
+               ep_poll_safewake(ep, epi);
  
         if (!(epi->event.events & EPOLLEXCLUSIVE))
                 ewake = 1;
@@ -1562,7 +1584,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
  
         /* We have to call this outside the lock */
         if (pwake)
-               ep_poll_safewake(&ep->poll_wait);
+               ep_poll_safewake(ep, NULL);
  
         return 0;
  
@@ -1666,7 +1688,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
  
         /* We have to call this outside the lock */
         if (pwake)
-               ep_poll_safewake(&ep->poll_wait);
+               ep_poll_safewake(ep, NULL);
  
         return 0;
  }
author	Jason Baron <jbaron@akamai.com>
	Tue, 7 Apr 2020 03:11:23 +0000 (20:11 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 7 Apr 2020 17:43:44 +0000 (10:43 -0700)