syzkaller was able to trigger a deadlock for NTF_MANAGED entries [0]:
kworker/0:16/14617 is trying to acquire lock:
ffffffff8d4dd370 (&tbl->lock){++-.}-{2:2}, at: ___neigh_create+0x9e1/0x2990 net/core/neighbour.c:652
[...]
but task is already holding lock:
ffffffff8d4dd370 (&tbl->lock){++-.}-{2:2}, at: neigh_managed_work+0x35/0x250 net/core/neighbour.c:1572
The neighbor entry turned to NUD_FAILED state, where __neigh_event_send()
triggered an immediate probe as per commit
38f62b815cc2 ("neigh: reduce
arp latency") via neigh_probe() given table lock was held.
One option to fix this situation is to defer the neigh_probe() back to
the neigh_timer_handler() similarly as pre
38f62b815cc2. For the case
of NTF_MANAGED, this deferral is acceptable given this only happens on
actual failure state and regular / expected state is NUD_VALID with the
entry already present.
The fix adds a parameter to __neigh_event_send() in order to communicate
whether immediate probe is allowed or disallowed. Existing call-sites
of neigh_event_send() default as-is to immediate probe. However, the
neigh_managed_work() disables it via use of neigh_event_send_probe().
[0] <TASK>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106
print_deadlock_bug kernel/locking/lockdep.c:2956 [inline]
check_deadlock kernel/locking/lockdep.c:2999 [inline]
validate_chain kernel/locking/lockdep.c:3788 [inline]
__lock_acquire.cold+0x149/0x3ab kernel/locking/lockdep.c:5027
lock_acquire kernel/locking/lockdep.c:5639 [inline]
lock_acquire+0x1ab/0x510 kernel/locking/lockdep.c:5604
__raw_write_lock_bh include/linux/rwlock_api_smp.h:202 [inline]
_raw_write_lock_bh+0x2f/0x40 kernel/locking/spinlock.c:334
___neigh_create+0x9e1/0x2990 net/core/neighbour.c:652
ip6_finish_output2+0x1070/0x14f0 net/ipv6/ip6_output.c:123
__ip6_finish_output net/ipv6/ip6_output.c:191 [inline]
__ip6_finish_output+0x61e/0xe90 net/ipv6/ip6_output.c:170
ip6_finish_output+0x32/0x200 net/ipv6/ip6_output.c:201
NF_HOOK_COND include/linux/netfilter.h:296 [inline]
ip6_output+0x1e4/0x530 net/ipv6/ip6_output.c:224
dst_output include/net/dst.h:451 [inline]
NF_HOOK include/linux/netfilter.h:307 [inline]
ndisc_send_skb+0xa99/0x17f0 net/ipv6/ndisc.c:508
ndisc_send_ns+0x3a9/0x840 net/ipv6/ndisc.c:650
ndisc_solicit+0x2cd/0x4f0 net/ipv6/ndisc.c:742
neigh_probe+0xc2/0x110 net/core/neighbour.c:1040
__neigh_event_send+0x37d/0x1570 net/core/neighbour.c:1201
neigh_event_send include/net/neighbour.h:470 [inline]
neigh_managed_work+0x162/0x250 net/core/neighbour.c:1574
process_one_work+0x9ac/0x1650 kernel/workqueue.c:2307
worker_thread+0x657/0x1110 kernel/workqueue.c:2454
kthread+0x2e9/0x3a0 kernel/kthread.c:377
ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295
</TASK>
Fixes: f93271f9a4d1 ("net, neigh: Add NTF_MANAGED flag for managed neighbor entries")
Reported-by: syzbot+5239d0e1778a500d477a@syzkaller.appspotmail.com
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Roopa Prabhu <roopa@nvidia.com>
Tested-by: syzbot+5239d0e1778a500d477a@syzkaller.appspotmail.com
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20220201193942.5055-1-daniel@iogearbox.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
return __neigh_create(tbl, pkey, dev, true);
}
void neigh_destroy(struct neighbour *neigh);
-int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb);
+int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb,
+ const bool immediate_ok);
int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags,
u32 nlmsg_pid);
void __neigh_set_probe_once(struct neighbour *neigh);
#define neigh_hold(n) refcount_inc(&(n)->refcnt)
-static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
+static __always_inline int neigh_event_send_probe(struct neighbour *neigh,
+ struct sk_buff *skb,
+ const bool immediate_ok)
{
unsigned long now = jiffies;
-
+
if (READ_ONCE(neigh->used) != now)
WRITE_ONCE(neigh->used, now);
- if (!(neigh->nud_state&(NUD_CONNECTED|NUD_DELAY|NUD_PROBE)))
- return __neigh_event_send(neigh, skb);
+ if (!(neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE)))
+ return __neigh_event_send(neigh, skb, immediate_ok);
return 0;
}
+static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
+{
+ return neigh_event_send_probe(neigh, skb, true);
+}
+
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb)
{
neigh_release(neigh);
}
-int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
+int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb,
+ const bool immediate_ok)
{
int rc;
bool immediate_probe = false;
atomic_set(&neigh->probes,
NEIGH_VAR(neigh->parms, UCAST_PROBES));
neigh_del_timer(neigh);
- neigh->nud_state = NUD_INCOMPLETE;
+ neigh->nud_state = NUD_INCOMPLETE;
neigh->updated = now;
- next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
- HZ/100);
+ if (!immediate_ok) {
+ next = now + 1;
+ } else {
+ immediate_probe = true;
+ next = now + max(NEIGH_VAR(neigh->parms,
+ RETRANS_TIME),
+ HZ / 100);
+ }
neigh_add_timer(neigh, next);
- immediate_probe = true;
} else {
neigh->nud_state = NUD_FAILED;
neigh->updated = jiffies;
write_lock_bh(&tbl->lock);
list_for_each_entry(neigh, &tbl->managed_list, managed_list)
- neigh_event_send(neigh, NULL);
+ neigh_event_send_probe(neigh, NULL, false);
queue_delayed_work(system_power_efficient_wq, &tbl->managed_work,
NEIGH_VAR(&tbl->parms, DELAY_PROBE_TIME));
write_unlock_bh(&tbl->lock);