ILD

NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #08!!!
作者:Yuan Jianpeng 邮箱:yuanjianpeng@xiaomi.com
发布时间:2023-7-10 站点:Inside Linux Development

最近再移植ath11k qsdk 12.2版本,移植后,启动出现打印:

NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #08!!!


打开内核调试选项:

Kernel hacking -> 

                    Debug Oops, Lockups and Hangs ->

                    Debug preemptible kernel

                        Lock Debugging (spinlocks, mutexes, etc...) ->

                    RCU Debugging  --->


选项变化:

diff --git a/targets/ipq50xx_32/kernel.config b/targets/ipq50xx_32/kernel.config

index cad3de8..109c260 100644

--- a/targets/ipq50xx_32/kernel.config

+++ b/targets/ipq50xx_32/kernel.config

@@ -205,7 +205,7 @@ CONFIG_IO_URING=y

 CONFIG_ADVISE_SYSCALLS=y

 CONFIG_MEMBARRIER=y

 CONFIG_KALLSYMS=y

-# CONFIG_KALLSYMS_ALL is not set

+CONFIG_KALLSYMS_ALL=y

 CONFIG_KALLSYMS_BASE_RELATIVE=y

 # CONFIG_USERFAULTFD is not set

 CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE=y

@@ -1316,7 +1316,6 @@ CONFIG_QTI_SCM_RESTART_REASON=y

 # CONFIG_TRUSTED_FOUNDATIONS is not set

 # CONFIG_GOOGLE_FIRMWARE is not set

 CONFIG_ARM_PSCI_FW=y

-# CONFIG_ARM_PSCI_CHECKER is not set

 CONFIG_HAVE_ARM_SMCCC=y

 CONFIG_HAVE_ARM_SMCCC_DISCOVERY=y

 # CONFIG_ARM_SMCCC_SOC_ID is not set

@@ -3758,7 +3757,7 @@ CONFIG_CC_HAS_WORKING_NOSANITIZE_ADDRESS=y

 # CONFIG_KASAN is not set

 # end of Memory Debugging

 

-# CONFIG_DEBUG_SHIRQ is not set

+CONFIG_DEBUG_SHIRQ=y

 

 #

 # Debug Oops, Lockups and Hangs

@@ -3766,10 +3765,16 @@ CONFIG_CC_HAS_WORKING_NOSANITIZE_ADDRESS=y

 # CONFIG_PANIC_ON_OOPS is not set

 CONFIG_PANIC_ON_OOPS_VALUE=0

 CONFIG_PANIC_TIMEOUT=0

-# CONFIG_SOFTLOCKUP_DETECTOR is not set

-# CONFIG_DETECT_HUNG_TASK is not set

-# CONFIG_WQ_WATCHDOG is not set

-# CONFIG_TEST_LOCKUP is not set

+CONFIG_LOCKUP_DETECTOR=y

+CONFIG_SOFTLOCKUP_DETECTOR=y

+# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set

+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0

+CONFIG_DETECT_HUNG_TASK=y

+CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=120

+# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set

+CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0

+CONFIG_WQ_WATCHDOG=y

+CONFIG_TEST_LOCKUP=m

 # end of Debug Oops, Lockups and Hangs

 

 #

@@ -3786,22 +3791,31 @@ CONFIG_DEBUG_PREEMPT=y

 # Lock Debugging (spinlocks, mutexes, etc...)

 #

 CONFIG_LOCK_DEBUGGING_SUPPORT=y

-# CONFIG_PROVE_LOCKING is not set

-# CONFIG_LOCK_STAT is not set

-# CONFIG_DEBUG_RT_MUTEXES is not set

-# CONFIG_DEBUG_SPINLOCK is not set

-# CONFIG_DEBUG_MUTEXES is not set

-# CONFIG_DEBUG_WW_MUTEX_SLOWPATH is not set

-# CONFIG_DEBUG_RWSEMS is not set

-# CONFIG_DEBUG_LOCK_ALLOC is not set

-# CONFIG_DEBUG_ATOMIC_SLEEP is not set

-# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set

-# CONFIG_LOCK_TORTURE_TEST is not set

-# CONFIG_WW_MUTEX_SELFTEST is not set

-# CONFIG_SCF_TORTURE_TEST is not set

+CONFIG_PROVE_LOCKING=y

+CONFIG_PROVE_RAW_LOCK_NESTING=y

+CONFIG_LOCK_STAT=y

+CONFIG_DEBUG_RT_MUTEXES=y

+CONFIG_DEBUG_SPINLOCK=y

+CONFIG_DEBUG_MUTEXES=y

+CONFIG_DEBUG_WW_MUTEX_SLOWPATH=y

+CONFIG_DEBUG_RWSEMS=y

+CONFIG_DEBUG_LOCK_ALLOC=y

+CONFIG_LOCKDEP=y

+CONFIG_LOCKDEP_BITS=15

+CONFIG_LOCKDEP_CHAINS_BITS=16

+CONFIG_LOCKDEP_STACK_TRACE_BITS=19

+CONFIG_LOCKDEP_STACK_TRACE_HASH_BITS=14

+CONFIG_LOCKDEP_CIRCULAR_QUEUE_BITS=12

+CONFIG_DEBUG_LOCKDEP=y

+CONFIG_DEBUG_ATOMIC_SLEEP=y

+CONFIG_DEBUG_LOCKING_API_SELFTESTS=y

+CONFIG_LOCK_TORTURE_TEST=y

+CONFIG_WW_MUTEX_SELFTEST=y

+CONFIG_SCF_TORTURE_TEST=y

 # end of Lock Debugging (spinlocks, mutexes, etc...)

 

-# CONFIG_DEBUG_IRQFLAGS is not set

+CONFIG_TRACE_IRQFLAGS=y

+CONFIG_DEBUG_IRQFLAGS=y

 CONFIG_STACKTRACE=y

 # CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set

 # CONFIG_DEBUG_KOBJECT is not set

@@ -3821,6 +3835,8 @@ CONFIG_STACKTRACE=y

 #

 # RCU Debugging

 #

+CONFIG_PROVE_RCU=y

+CONFIG_TORTURE_TEST=y

 # CONFIG_RCU_SCALE_TEST is not set

 # CONFIG_RCU_TORTURE_TEST is not set

 # CONFIG_RCU_REF_SCALE_TEST is not set

@@ -3844,6 +3860,7 @@ CONFIG_TRACE_CLOCK=y

 CONFIG_RING_BUFFER=y

 CONFIG_EVENT_TRACING=y

 CONFIG_CONTEXT_SWITCH_TRACER=y

+CONFIG_PREEMPTIRQ_TRACEPOINTS=y

 CONFIG_TRACING=y

 CONFIG_TRACING_SUPPORT=y

 CONFIG_FTRACE=y


重新编译后,第一个异常打印如下:

[  168.997295] ------------[ cut here ]------------

[  168.997393] WARNING: CPU: 0 PID: 55 at kernel/rcu/tree_plugin.h:427 ath11k_wmi_tlv_fw_stats_parse+0x49c/0x8bc [ath11k]

[  169.001067] Modules linked in: ath11k_ahb(O) ath11k(O) mac80211(O) cfg80211(O) compat(O) iptable_nat xt_MASQUERADE nf_nat traffic(O) nss_dp(O) ssdk(O)

[  169.011561] CPU: 0 PID: 55 Comm: lock_torture_wr Tainted: G           O      5.15.38+ #2

[  169.024954] Hardware name: Generic DT based system

[  169.033204] [<81310908>] (unwind_backtrace) from [<8130bc00>] (show_stack+0x10/0x14)

[  169.037802] [<8130bc00>] (show_stack) from [<81a6942c>] (dump_stack_lvl+0x68/0x90)

[  169.045701] [<81a6942c>] (dump_stack_lvl) from [<813212fc>] (__warn+0xec/0x15c)

[  169.053079] [<813212fc>] (__warn) from [<81a5fd94>] (warn_slowpath_fmt+0x78/0xa8)

[  169.060286] [<81a5fd94>] (warn_slowpath_fmt) from [<7f313adc>] (ath11k_wmi_tlv_fw_stats_parse+0x49c/0x8bc [ath11k])

[  169.067933] [<7f313adc>] (ath11k_wmi_tlv_fw_stats_parse [ath11k]) from [<7f30f504>] (ath11k_wmi_tlv_iter+0x5c/0x138 [ath11k])

[  169.078176] [<7f30f504>] (ath11k_wmi_tlv_iter [ath11k]) from [<7f31ecb0>] (ath11k_wmi_tlv_op_rx+0x113c/0x3150 [ath11k])

[  169.089633] [<7f31ecb0>] (ath11k_wmi_tlv_op_rx [ath11k]) from [<7f3386c8>] (ath11k_htc_rx_completion_handler+0x380/0x4f8 [ath11k])

[  169.100226] [<7f3386c8>] (ath11k_htc_rx_completion_handler [ath11k]) from [<7f34f558>] (ath11k_ce_per_engine_service+0x2bc/0x38c [ath11k])

[  169.112032] [<7f34f558>] (ath11k_ce_per_engine_service [ath11k]) from [<7f355388>] (ath11k_pcic_ce_tasklet+0x18/0x34 [ath11k])

[  169.124444] [<7f355388>] (ath11k_pcic_ce_tasklet [ath11k]) from [<8132a06c>] (tasklet_action_common.constprop.0+0xf0/0x120)

[  169.135813] [<8132a06c>] (tasklet_action_common.constprop.0) from [<81301360>] (__do_softirq+0x138/0x67c)

[  169.146833] [<81301360>] (__do_softirq) from [<81329a4c>] (__irq_exit_rcu+0x170/0x218)

[  169.156552] [<81329a4c>] (__irq_exit_rcu) from [<81329c3c>] (irq_exit+0x8/0x34)

[  169.164363] [<81329c3c>] (irq_exit) from [<813a3f28>] (handle_domain_irq+0x80/0xb0)

[  169.171569] [<813a3f28>] (handle_domain_irq) from [<816c51b8>] (gic_handle_irq+0x8c/0xb4)

[  169.179210] [<816c51b8>] (gic_handle_irq) from [<81300b40>] (__irq_svc+0x60/0x98)

[  169.187539] Exception stack(0x80bbbe68 to 0x80bbbeb0)

[  169.195006] be60:                   81f22ca0 00000000 00000000 80bba000 81fde900 81f15010

[  169.200045] be80: 00000000 00000000 81f22ca0 00000000 00000000 00000000 00000000 80bbbeb8

[  169.208203] bea0: 81a79c8c 8138e558 600e0013 ffffffff

[  169.216358] [<81300b40>] (__irq_svc) from [<8138e558>] (lock_acquire+0x5c/0x498)

[  169.221395] [<8138e558>] (lock_acquire) from [<81a79c8c>] (_raw_spin_lock+0x3c/0x78)

[  169.228861] [<81a79c8c>] (_raw_spin_lock) from [<81398a14>] (torture_spin_lock_write_lock+0x10/0x18)

[  169.236588] [<81398a14>] (torture_spin_lock_write_lock) from [<81398c1c>] (lock_torture_writer+0xa4/0x208)

[  169.245703] [<81398c1c>] (lock_torture_writer) from [<8134d4c0>] (kthread+0x16c/0x1a4)

[  169.255163] [<8134d4c0>] (kthread) from [<813000fc>] (ret_from_fork+0x14/0x38)

[  169.263061] Exception stack(0x80bbbfb0 to 0x80bbbff8)

[  169.270265] bfa0:                                     00000000 00000000 00000000 00000000

[  169.275391] bfc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000

[  169.283551] bfe0: 00000000 00000000 00000000 00000000 00000013 00000000

[  169.291800] irq event stamp: 74762

[  169.298126] hardirqs last  enabled at (74761): [<81a7a24c>] _raw_spin_unlock_irqrestore+0x70/0x84

[  169.301649] hardirqs last disabled at (74762): [<81a71998>] __schedule+0x4ec/0xa84

[  169.310581] softirqs last  enabled at (74746): [<81301598>] __do_softirq+0x370/0x67c

[  169.318009] softirqs last disabled at (74741): [<81329a4c>] __irq_exit_rcu+0x170/0x218

[  169.325943] ---[ end trace e9af2b181e05f0f5 ]---

[  169.335398] BUG: sleeping function called from invalid context at kernel/torture.c:728

[  169.338457] in_atomic(): 0, irqs_disabled(): 0, non_block: 0, pid: 55, name: lock_torture_wr

[  169.346280] INFO: lockdep is turned off.

[  169.354820] Preemption disabled at:

[  169.354833] [<81398c70>] lock_torture_writer+0xf8/0x208

[  169.361988] CPU: 1 PID: 55 Comm: lock_torture_wr Tainted: G        W  O      5.15.38+ #2

[  169.367054] Hardware name: Generic DT based system

[  169.375393] [<81310908>] (unwind_backtrace) from [<8130bc00>] (show_stack+0x10/0x14)

[  169.379989] [<8130bc00>] (show_stack) from [<81a6942c>] (dump_stack_lvl+0x68/0x90)

[  169.387888] [<81a6942c>] (dump_stack_lvl) from [<81356008>] (___might_sleep+0x1ec/0x2bc)

[  169.395268] [<81356008>] (___might_sleep) from [<8145eea8>] (stutter_wait+0x58/0x1ac)

[  169.403514] [<8145eea8>] (stutter_wait) from [<81398c7c>] (lock_torture_writer+0x104/0x208)

[  169.411239] [<81398c7c>] (lock_torture_writer) from [<8134d4c0>] (kthread+0x16c/0x1a4)

[  169.419398] [<8134d4c0>] (kthread) from [<813000fc>] (ret_from_fork+0x14/0x38)

[  169.427383] Exception stack(0x80bbbfb0 to 0x80bbbff8)

[  169.434588] bfa0:                                     00000000 00000000 00000000 00000000

[  169.439715] bfc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000

[  169.447874] bfe0: 00000000 00000000 00000000 00000000 00000013 00000000


查看代码:

 7417         rcu_read_lock();

 7418

 7419         ar = ath11k_mac_get_ar_by_pdev_id(ab, ev->pdev_id);

。。。

 7452         }

 7453         rcu_read_unlock();

 7454

 7455         for (i = 0; i < ev->num_vdev_stats; i++) {

。。。

 7517         }

 7518

 7519 exit:

 7520         rcu_read_unlock();

 7521         return ret;


可以看到执行了两次rcu_read_unlock(),注释掉中间的后,正常了。这只是其中一个问题,并没有解决NOHZ那个打印。


根据参考文献:

NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #08!!!

handler是8,表示是 NET_RX_SOFTIRQ (1 << 3)


ftrace开启softirq_raise:

echo "vec == 3" > /sys/kernel/debug/tracing/events/irq/softirq_raise/filter

echo "stacktrace if vec == 3" > /sys/kernel/debug/tracing/events/irq/softirq_raise/trigger

echo 1 >/sys/kernel/debug/tracing/events/irq/softirq_raise/enable

cat /sys/kernel/debug/tracing/trace_pipe > /tmp/trace.txt


         hostapd-184     [000] d..2.   367.844820: softirq_raise: vec=3 [action=NET_RX]

         hostapd-184     [000] d..3.   367.844873: <stack trace>

 => netif_rx

 => sta_info_move_state

 => sta_apply_auth_flags.constprop.0

 => sta_apply_parameters

 => ieee80211_change_station

 => nl80211_set_station

 => genl_rcv_msg

 => netlink_rcv_skb

 => genl_rcv

 => netlink_unicast

 => netlink_sendmsg

 => ____sys_sendmsg

 => ___sys_sendmsg

 => sys_sendmsg

 => ret_fast_syscall


根据参考网址,加上local_bh_disable/local_bh_enable似乎解决了。


diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c

index 4ad622f..9f0052c 100644

--- a/net/mac80211/sta_info.c

+++ b/net/mac80211/sta_info.c

@@ -2212,6 +2212,7 @@ int sta_info_move_state(struct sta_info *sta,

                        return err;

        }


+       local_bh_disable();

        /* reflect the change in all state variables */


        switch (new_state) {

@@ -2261,6 +2262,7 @@ int sta_info_move_state(struct sta_info *sta,


        sta->sta_state = new_state;


+       local_bh_enable();

        return 0;

 }



参考:

https://lore.kernel.org/all/87y28b9nyn.ffs@tglx/t/


Copyright © linuxdev.cc 2017-2024. Some Rights Reserved.