最近再移植ath11k qsdk 12.2版本,移植后,启动出现打印:
NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #08!!!
打开内核调试选项:
Kernel hacking ->
Debug Oops, Lockups and Hangs ->
Debug preemptible kernel
Lock Debugging (spinlocks, mutexes, etc...) ->
RCU Debugging --->
选项变化:
diff --git a/targets/ipq50xx_32/kernel.config b/targets/ipq50xx_32/kernel.config
index cad3de8..109c260 100644
--- a/targets/ipq50xx_32/kernel.config
+++ b/targets/ipq50xx_32/kernel.config
@@ -205,7 +205,7 @@ CONFIG_IO_URING=y
CONFIG_ADVISE_SYSCALLS=y
CONFIG_MEMBARRIER=y
CONFIG_KALLSYMS=y
-# CONFIG_KALLSYMS_ALL is not set
+CONFIG_KALLSYMS_ALL=y
CONFIG_KALLSYMS_BASE_RELATIVE=y
# CONFIG_USERFAULTFD is not set
CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE=y
@@ -1316,7 +1316,6 @@ CONFIG_QTI_SCM_RESTART_REASON=y
# CONFIG_TRUSTED_FOUNDATIONS is not set
# CONFIG_GOOGLE_FIRMWARE is not set
CONFIG_ARM_PSCI_FW=y
-# CONFIG_ARM_PSCI_CHECKER is not set
CONFIG_HAVE_ARM_SMCCC=y
CONFIG_HAVE_ARM_SMCCC_DISCOVERY=y
# CONFIG_ARM_SMCCC_SOC_ID is not set
@@ -3758,7 +3757,7 @@ CONFIG_CC_HAS_WORKING_NOSANITIZE_ADDRESS=y
# CONFIG_KASAN is not set
# end of Memory Debugging
-# CONFIG_DEBUG_SHIRQ is not set
+CONFIG_DEBUG_SHIRQ=y
#
# Debug Oops, Lockups and Hangs
@@ -3766,10 +3765,16 @@ CONFIG_CC_HAS_WORKING_NOSANITIZE_ADDRESS=y
# CONFIG_PANIC_ON_OOPS is not set
CONFIG_PANIC_ON_OOPS_VALUE=0
CONFIG_PANIC_TIMEOUT=0
-# CONFIG_SOFTLOCKUP_DETECTOR is not set
-# CONFIG_DETECT_HUNG_TASK is not set
-# CONFIG_WQ_WATCHDOG is not set
-# CONFIG_TEST_LOCKUP is not set
+CONFIG_LOCKUP_DETECTOR=y
+CONFIG_SOFTLOCKUP_DETECTOR=y
+# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0
+CONFIG_DETECT_HUNG_TASK=y
+CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=120
+# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0
+CONFIG_WQ_WATCHDOG=y
+CONFIG_TEST_LOCKUP=m
# end of Debug Oops, Lockups and Hangs
#
@@ -3786,22 +3791,31 @@ CONFIG_DEBUG_PREEMPT=y
# Lock Debugging (spinlocks, mutexes, etc...)
#
CONFIG_LOCK_DEBUGGING_SUPPORT=y
-# CONFIG_PROVE_LOCKING is not set
-# CONFIG_LOCK_STAT is not set
-# CONFIG_DEBUG_RT_MUTEXES is not set
-# CONFIG_DEBUG_SPINLOCK is not set
-# CONFIG_DEBUG_MUTEXES is not set
-# CONFIG_DEBUG_WW_MUTEX_SLOWPATH is not set
-# CONFIG_DEBUG_RWSEMS is not set
-# CONFIG_DEBUG_LOCK_ALLOC is not set
-# CONFIG_DEBUG_ATOMIC_SLEEP is not set
-# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
-# CONFIG_LOCK_TORTURE_TEST is not set
-# CONFIG_WW_MUTEX_SELFTEST is not set
-# CONFIG_SCF_TORTURE_TEST is not set
+CONFIG_PROVE_LOCKING=y
+CONFIG_PROVE_RAW_LOCK_NESTING=y
+CONFIG_LOCK_STAT=y
+CONFIG_DEBUG_RT_MUTEXES=y
+CONFIG_DEBUG_SPINLOCK=y
+CONFIG_DEBUG_MUTEXES=y
+CONFIG_DEBUG_WW_MUTEX_SLOWPATH=y
+CONFIG_DEBUG_RWSEMS=y
+CONFIG_DEBUG_LOCK_ALLOC=y
+CONFIG_LOCKDEP=y
+CONFIG_LOCKDEP_BITS=15
+CONFIG_LOCKDEP_CHAINS_BITS=16
+CONFIG_LOCKDEP_STACK_TRACE_BITS=19
+CONFIG_LOCKDEP_STACK_TRACE_HASH_BITS=14
+CONFIG_LOCKDEP_CIRCULAR_QUEUE_BITS=12
+CONFIG_DEBUG_LOCKDEP=y
+CONFIG_DEBUG_ATOMIC_SLEEP=y
+CONFIG_DEBUG_LOCKING_API_SELFTESTS=y
+CONFIG_LOCK_TORTURE_TEST=y
+CONFIG_WW_MUTEX_SELFTEST=y
+CONFIG_SCF_TORTURE_TEST=y
# end of Lock Debugging (spinlocks, mutexes, etc...)
-# CONFIG_DEBUG_IRQFLAGS is not set
+CONFIG_TRACE_IRQFLAGS=y
+CONFIG_DEBUG_IRQFLAGS=y
CONFIG_STACKTRACE=y
# CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set
# CONFIG_DEBUG_KOBJECT is not set
@@ -3821,6 +3835,8 @@ CONFIG_STACKTRACE=y
#
# RCU Debugging
#
+CONFIG_PROVE_RCU=y
+CONFIG_TORTURE_TEST=y
# CONFIG_RCU_SCALE_TEST is not set
# CONFIG_RCU_TORTURE_TEST is not set
# CONFIG_RCU_REF_SCALE_TEST is not set
@@ -3844,6 +3860,7 @@ CONFIG_TRACE_CLOCK=y
CONFIG_RING_BUFFER=y
CONFIG_EVENT_TRACING=y
CONFIG_CONTEXT_SWITCH_TRACER=y
+CONFIG_PREEMPTIRQ_TRACEPOINTS=y
CONFIG_TRACING=y
CONFIG_TRACING_SUPPORT=y
CONFIG_FTRACE=y
重新编译后,第一个异常打印如下:
[ 168.997295] ------------[ cut here ]------------
[ 168.997393] WARNING: CPU: 0 PID: 55 at kernel/rcu/tree_plugin.h:427 ath11k_wmi_tlv_fw_stats_parse+0x49c/0x8bc [ath11k]
[ 169.001067] Modules linked in: ath11k_ahb(O) ath11k(O) mac80211(O) cfg80211(O) compat(O) iptable_nat xt_MASQUERADE nf_nat traffic(O) nss_dp(O) ssdk(O)
[ 169.011561] CPU: 0 PID: 55 Comm: lock_torture_wr Tainted: G O 5.15.38+ #2
[ 169.024954] Hardware name: Generic DT based system
[ 169.033204] [<81310908>] (unwind_backtrace) from [<8130bc00>] (show_stack+0x10/0x14)
[ 169.037802] [<8130bc00>] (show_stack) from [<81a6942c>] (dump_stack_lvl+0x68/0x90)
[ 169.045701] [<81a6942c>] (dump_stack_lvl) from [<813212fc>] (__warn+0xec/0x15c)
[ 169.053079] [<813212fc>] (__warn) from [<81a5fd94>] (warn_slowpath_fmt+0x78/0xa8)
[ 169.060286] [<81a5fd94>] (warn_slowpath_fmt) from [<7f313adc>] (ath11k_wmi_tlv_fw_stats_parse+0x49c/0x8bc [ath11k])
[ 169.067933] [<7f313adc>] (ath11k_wmi_tlv_fw_stats_parse [ath11k]) from [<7f30f504>] (ath11k_wmi_tlv_iter+0x5c/0x138 [ath11k])
[ 169.078176] [<7f30f504>] (ath11k_wmi_tlv_iter [ath11k]) from [<7f31ecb0>] (ath11k_wmi_tlv_op_rx+0x113c/0x3150 [ath11k])
[ 169.089633] [<7f31ecb0>] (ath11k_wmi_tlv_op_rx [ath11k]) from [<7f3386c8>] (ath11k_htc_rx_completion_handler+0x380/0x4f8 [ath11k])
[ 169.100226] [<7f3386c8>] (ath11k_htc_rx_completion_handler [ath11k]) from [<7f34f558>] (ath11k_ce_per_engine_service+0x2bc/0x38c [ath11k])
[ 169.112032] [<7f34f558>] (ath11k_ce_per_engine_service [ath11k]) from [<7f355388>] (ath11k_pcic_ce_tasklet+0x18/0x34 [ath11k])
[ 169.124444] [<7f355388>] (ath11k_pcic_ce_tasklet [ath11k]) from [<8132a06c>] (tasklet_action_common.constprop.0+0xf0/0x120)
[ 169.135813] [<8132a06c>] (tasklet_action_common.constprop.0) from [<81301360>] (__do_softirq+0x138/0x67c)
[ 169.146833] [<81301360>] (__do_softirq) from [<81329a4c>] (__irq_exit_rcu+0x170/0x218)
[ 169.156552] [<81329a4c>] (__irq_exit_rcu) from [<81329c3c>] (irq_exit+0x8/0x34)
[ 169.164363] [<81329c3c>] (irq_exit) from [<813a3f28>] (handle_domain_irq+0x80/0xb0)
[ 169.171569] [<813a3f28>] (handle_domain_irq) from [<816c51b8>] (gic_handle_irq+0x8c/0xb4)
[ 169.179210] [<816c51b8>] (gic_handle_irq) from [<81300b40>] (__irq_svc+0x60/0x98)
[ 169.187539] Exception stack(0x80bbbe68 to 0x80bbbeb0)
[ 169.195006] be60: 81f22ca0 00000000 00000000 80bba000 81fde900 81f15010
[ 169.200045] be80: 00000000 00000000 81f22ca0 00000000 00000000 00000000 00000000 80bbbeb8
[ 169.208203] bea0: 81a79c8c 8138e558 600e0013 ffffffff
[ 169.216358] [<81300b40>] (__irq_svc) from [<8138e558>] (lock_acquire+0x5c/0x498)
[ 169.221395] [<8138e558>] (lock_acquire) from [<81a79c8c>] (_raw_spin_lock+0x3c/0x78)
[ 169.228861] [<81a79c8c>] (_raw_spin_lock) from [<81398a14>] (torture_spin_lock_write_lock+0x10/0x18)
[ 169.236588] [<81398a14>] (torture_spin_lock_write_lock) from [<81398c1c>] (lock_torture_writer+0xa4/0x208)
[ 169.245703] [<81398c1c>] (lock_torture_writer) from [<8134d4c0>] (kthread+0x16c/0x1a4)
[ 169.255163] [<8134d4c0>] (kthread) from [<813000fc>] (ret_from_fork+0x14/0x38)
[ 169.263061] Exception stack(0x80bbbfb0 to 0x80bbbff8)
[ 169.270265] bfa0: 00000000 00000000 00000000 00000000
[ 169.275391] bfc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[ 169.283551] bfe0: 00000000 00000000 00000000 00000000 00000013 00000000
[ 169.291800] irq event stamp: 74762
[ 169.298126] hardirqs last enabled at (74761): [<81a7a24c>] _raw_spin_unlock_irqrestore+0x70/0x84
[ 169.301649] hardirqs last disabled at (74762): [<81a71998>] __schedule+0x4ec/0xa84
[ 169.310581] softirqs last enabled at (74746): [<81301598>] __do_softirq+0x370/0x67c
[ 169.318009] softirqs last disabled at (74741): [<81329a4c>] __irq_exit_rcu+0x170/0x218
[ 169.325943] ---[ end trace e9af2b181e05f0f5 ]---
[ 169.335398] BUG: sleeping function called from invalid context at kernel/torture.c:728
[ 169.338457] in_atomic(): 0, irqs_disabled(): 0, non_block: 0, pid: 55, name: lock_torture_wr
[ 169.346280] INFO: lockdep is turned off.
[ 169.354820] Preemption disabled at:
[ 169.354833] [<81398c70>] lock_torture_writer+0xf8/0x208
[ 169.361988] CPU: 1 PID: 55 Comm: lock_torture_wr Tainted: G W O 5.15.38+ #2
[ 169.367054] Hardware name: Generic DT based system
[ 169.375393] [<81310908>] (unwind_backtrace) from [<8130bc00>] (show_stack+0x10/0x14)
[ 169.379989] [<8130bc00>] (show_stack) from [<81a6942c>] (dump_stack_lvl+0x68/0x90)
[ 169.387888] [<81a6942c>] (dump_stack_lvl) from [<81356008>] (___might_sleep+0x1ec/0x2bc)
[ 169.395268] [<81356008>] (___might_sleep) from [<8145eea8>] (stutter_wait+0x58/0x1ac)
[ 169.403514] [<8145eea8>] (stutter_wait) from [<81398c7c>] (lock_torture_writer+0x104/0x208)
[ 169.411239] [<81398c7c>] (lock_torture_writer) from [<8134d4c0>] (kthread+0x16c/0x1a4)
[ 169.419398] [<8134d4c0>] (kthread) from [<813000fc>] (ret_from_fork+0x14/0x38)
[ 169.427383] Exception stack(0x80bbbfb0 to 0x80bbbff8)
[ 169.434588] bfa0: 00000000 00000000 00000000 00000000
[ 169.439715] bfc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[ 169.447874] bfe0: 00000000 00000000 00000000 00000000 00000013 00000000
查看代码:
7417 rcu_read_lock();
7418
7419 ar = ath11k_mac_get_ar_by_pdev_id(ab, ev->pdev_id);
。。。
7452 }
7453 rcu_read_unlock();
7454
7455 for (i = 0; i < ev->num_vdev_stats; i++) {
。。。
7517 }
7518
7519 exit:
7520 rcu_read_unlock();
7521 return ret;
可以看到执行了两次rcu_read_unlock(),注释掉中间的后,正常了。这只是其中一个问题,并没有解决NOHZ那个打印。
根据参考文献:
NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #08!!!
handler是8,表示是 NET_RX_SOFTIRQ (1 << 3)
ftrace开启softirq_raise:
echo "vec == 3" > /sys/kernel/debug/tracing/events/irq/softirq_raise/filter
echo "stacktrace if vec == 3" > /sys/kernel/debug/tracing/events/irq/softirq_raise/trigger
echo 1 >/sys/kernel/debug/tracing/events/irq/softirq_raise/enable
cat /sys/kernel/debug/tracing/trace_pipe > /tmp/trace.txt
hostapd-184 [000] d..2. 367.844820: softirq_raise: vec=3 [action=NET_RX]
hostapd-184 [000] d..3. 367.844873: <stack trace>
=> netif_rx
=> sta_info_move_state
=> sta_apply_auth_flags.constprop.0
=> sta_apply_parameters
=> ieee80211_change_station
=> nl80211_set_station
=> genl_rcv_msg
=> netlink_rcv_skb
=> genl_rcv
=> netlink_unicast
=> netlink_sendmsg
=> ____sys_sendmsg
=> ___sys_sendmsg
=> sys_sendmsg
=> ret_fast_syscall
根据参考网址,加上local_bh_disable/local_bh_enable似乎解决了。
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 4ad622f..9f0052c 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -2212,6 +2212,7 @@ int sta_info_move_state(struct sta_info *sta,
return err;
}
+ local_bh_disable();
/* reflect the change in all state variables */
switch (new_state) {
@@ -2261,6 +2262,7 @@ int sta_info_move_state(struct sta_info *sta,
sta->sta_state = new_state;
+ local_bh_enable();
return 0;
}
参考:
https://lore.kernel.org/all/87y28b9nyn.ffs@tglx/t/