]> git.baikalelectronics.ru Git - kernel.git/commitdiff
workqueue/watchdog: Make unbound workqueues aware of touch_softlockup_watchdog()
authorWang Qing <wangqing@vivo.com>
Wed, 24 Mar 2021 11:40:29 +0000 (19:40 +0800)
committerTejun Heo <tj@kernel.org>
Sun, 4 Apr 2021 17:26:49 +0000 (13:26 -0400)
84;0;0c84;0;0c
There are two workqueue-specific watchdog timestamps:

    + @wq_watchdog_touched_cpu (per-CPU) updated by
      touch_softlockup_watchdog()

    + @wq_watchdog_touched (global) updated by
      touch_all_softlockup_watchdogs()

watchdog_timer_fn() checks only the global @wq_watchdog_touched for
unbound workqueues. As a result, unbound workqueues are not aware
of touch_softlockup_watchdog(). The watchdog might report a stall
even when the unbound workqueues are blocked by a known slow code.

Solution:
touch_softlockup_watchdog() must touch also the global @wq_watchdog_touched
timestamp.

The global timestamp can no longer be used for bound workqueues because
it is now updated from all CPUs. Instead, bound workqueues have to check
only @wq_watchdog_touched_cpu and these timestamps have to be updated for
all CPUs in touch_all_softlockup_watchdogs().

Beware:
The change might cause the opposite problem. An unbound workqueue
might get blocked on CPU A because of a real softlockup. The workqueue
watchdog would miss it when the timestamp got touched on CPU B.

It is acceptable because softlockups are detected by softlockup
watchdog. The workqueue watchdog is there to detect stalls where
a work never finishes, for example, because of dependencies of works
queued into the same workqueue.

V3:
- Modify the commit message clearly according to Petr's suggestion.

Signed-off-by: Wang Qing <wangqing@vivo.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
kernel/watchdog.c
kernel/workqueue.c

index 71109065bd8ebf4ee946a902321b4cd8934a3d95..107bc38b19450afa728d89abfc6b57696d1dd09f 100644 (file)
@@ -278,9 +278,10 @@ void touch_all_softlockup_watchdogs(void)
         * update as well, the only side effect might be a cycle delay for
         * the softlockup check.
         */
-       for_each_cpu(cpu, &watchdog_allowed_mask)
+       for_each_cpu(cpu, &watchdog_allowed_mask) {
                per_cpu(watchdog_touch_ts, cpu) = SOFTLOCKUP_RESET;
-       wq_watchdog_touch(-1);
+               wq_watchdog_touch(cpu);
+       }
 }
 
 void touch_softlockup_watchdog_sync(void)
index 21fb00b52def206c9a9241ffb89973b5efbd9b49..79f2319543ced3d62ac7773e986d0f0e5c79c6e9 100644 (file)
@@ -5787,22 +5787,17 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
                        continue;
 
                /* get the latest of pool and touched timestamps */
+               if (pool->cpu >= 0)
+                       touched = READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu));
+               else
+                       touched = READ_ONCE(wq_watchdog_touched);
                pool_ts = READ_ONCE(pool->watchdog_ts);
-               touched = READ_ONCE(wq_watchdog_touched);
 
                if (time_after(pool_ts, touched))
                        ts = pool_ts;
                else
                        ts = touched;
 
-               if (pool->cpu >= 0) {
-                       unsigned long cpu_touched =
-                               READ_ONCE(per_cpu(wq_watchdog_touched_cpu,
-                                                 pool->cpu));
-                       if (time_after(cpu_touched, ts))
-                               ts = cpu_touched;
-               }
-
                /* did we stall? */
                if (time_after(jiffies, ts + thresh)) {
                        lockup_detected = true;
@@ -5826,8 +5821,8 @@ notrace void wq_watchdog_touch(int cpu)
 {
        if (cpu >= 0)
                per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
-       else
-               wq_watchdog_touched = jiffies;
+
+       wq_watchdog_touched = jiffies;
 }
 
 static void wq_watchdog_set_thresh(unsigned long thresh)