ILD

gmac clk时钟频率不对导致丢包问题解决
作者:Yuan Jianpeng 邮箱:yuanjp89@163.com
发布时间:2022-8-21 站点:Inside Linux Development

ipq5018移植后,LAN口不通,WAN口通。查看lan接口eth1:


/ # ifconfig eth1

eth1      Link encap:Ethernet  HWaddr 00:00:00:00:AC:02

          inet6 addr: fe80::200:ff:fe00:ac02/64 Scope:Link

          UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1

          RX packets:0 errors:5 dropped:5 overruns:0 frame:5

          TX packets:18 errors:0 dropped:0 overruns:0 carrier:0

          collisions:0 txqueuelen:1000

          RX bytes:0 (0.0 B)  TX bytes:2104 (2.0 KiB)

          Interrupt:105


发现有错误计数,阅读nss-dp驱动,接口统计代码:hal/gmac_ops/syn/gmac/syn_if.c

        stats->rx_crc_errors = hal_stats.RxFcsErr + hal_stats.RxJumboFcsErr;

        stats->rx_frame_errors = hal_stats.RxAllignErr +

                                 hal_stats.RxJumboAligenErr + hal_stats.RxRunt;

        stats->rx_fifo_errors = hal_stats.RxOverFlow;

        stats->rx_errors = stats->rx_crc_errors + stats->rx_frame_errors +

                           stats->rx_fifo_errors;


        stats->rx_dropped = hal_stats.RxTooLong + stats->rx_errors;


加调试后,打印出来发现是FcsErr有计数。读取switch0的2口的mib计数,发现有fcs错误包。


/ # ssdk_sh


 SSDK Init OK!

 Welcome to SSDK Shell version: , at .

dev0@qca>device id set 0


operation done.


dev0@qca>mib counter get 2


[MIB Counter]

RxBroad       <0x00000000000000>  RxPause       <0x00000000000000>  RxMulti       <0x00000000000000>

RxFcsErr      <0x0000000000001e>  RxAlignErr    <0x00000000000000>  RxRunt        <0x00000000000000>

RxFragment    <0x00000000000019>  Rx64Byte      <0x00000000000000>  Rx128Byte     <0x00000000000005>

Rx256Byte     <0x00000000000000>  Rx512Byte     <0x00000000000000>  Rx1024Byte    <0x00000000000000>

Rx1518Byte    <0x00000000000000>  RxMaxByte     <0x00000000000000>  RxTooLong     <0x00000000000000>

RxGoodByte    <0x00000000000000>  RxBadByte     <0x00000000000000>  RxOverFlow    <0x00000000000000>

Filtered      <0x00000000000000>  TxBroad       <0x00000000000014>  TxPause       <0x00000000000000>

TxMulti       <0x0000000000001a>  TxUnderRun    <0x00000000000000>  Tx64Byte      <0x00000000000012>

Tx128Byte     <0x0000000000000e>  Tx256Byte     <0x0000000000000a>  Tx512Byte     <0x00000000000004>

Tx1024Byte    <0x00000000000000>  Tx1518Byte    <0x00000000000000>  TxMaxByte     <0x00000000000000>

TxOverSize    <0x00000000000000>  TxByte        <0x000000000014b4>  TxCollision   <0x00000000000000>

TxAbortCol    <0x00000000000000>  TxMultiCol    <0x00000000000000>  TxSingleCol   <0x00000000000000>

TxExcDefer    <0x00000000000000>  TxDefer       <0x00000000000000>  TxLateCol     <0x00000000000000>

RxUniCast     <0x00000000000000>  TxUniCast     <0x00000000000000>  RxJmFcsErr    <0x00000000000000>

RxJmAligErr   <0x00000000000000>  Rx14To63      <0x00000000000000>  RxTooLongByte <0x00000000000000>

RxRuntByte    <0x00000000000000>

operation done.


继续排查

为什么有问题呢,先从对比法下手,先试试qsdk,发现没有问题,然后尝试对比device tree,发现也没问题。

手动写了发广播包的工具,来发大量包,便于快速发现是不是有错误丢包。

然后怀疑是不是ssdk的差异。对比代码是一样的,怀疑是编译选项的差异,就用Xrouter的编译脚本,在qsdk的内核上编译,将ko拷贝到

qsdk的rootfs,生成image后烧录,按f enter进入failsafe模式,手动安装ssdk,使用ssdk_sh,发现没有错误计数。

怀疑可能是其它安装模块的差异,就启动后进入failsafe模式,一个模块都不安装,只安装ssdk,发现qsdk还是没问题,Xrouter还是有问题。


找对方向

后来偶然发现initramfs的Xrouter image没问题,而ubi格式的image有问题。于是尝试比对差异,一直没有头绪,睡了一个晚上,灵感来了:使用initramfs启动时,需要进uboot命令行,通过tftp下载镜像再执行bootm;而ubi启动不进命令行。进命令行是要做eth初始化的,因此在ubi启动的时候也加上eth初始化,发现网络好了。对初始化进行调试,定位到是uboot中下面的语句影响通不通:

board/qca/arm/ipq5018/ipq5018.c


void ethernet_clock_enable(void)

{

        cmn_blk_clk_set();

        uniphy_clk_set();

        gephy_uniphy_clock_disable();

        gmac_clock_disable();

        gmac_clk_src_init();

        cmn_clock_init();

        cmnblk_enable();

        cmnblk_check_state();

        gephy_reset();

        uniphy_reset();

        gmac_reset();

        gcc_clock_enable();

}

static void gmac_clk_src_init(void)

{

        reg_val = readl(GCC_GMAC1_RX_CFG_RCGR);

        reg_val &= ~GCC_GMAC_CFG_RCGR_SRC_SEL_MASK;

        reg_val |= GCC_GMAC1_RX_SRC_SEL_UNIPHY_RX;

        writel(reg_val, GCC_GMAC1_RX_CFG_RCGR);


        reg_val = readl(GCC_GMAC1_TX_CFG_RCGR);

        reg_val &= ~GCC_GMAC_CFG_RCGR_SRC_SEL_MASK;

        reg_val |= GCC_GMAC1_TX_SRC_SEL_UNIPHY_TX;

        writel(reg_val, GCC_GMAC1_TX_CFG_RCGR);


        。。。

}


对应的寄存器

#define GCC_GMAC1_RX_CMD_RCGR                   0x01868030

#define GCC_GMAC1_RX_CFG_RCGR                   0x01868034

#define GCC_GMAC1_TX_CMD_RCGR                   0x01868038

#define GCC_GMAC1_TX_CFG_RCGR                   0x0186803C


搜索内核的代码,发现是在drivers/clk/qcom/gcc-ipq5018.c

static struct clk_rcg2 gmac1_rx_clk_src = {

        .cmd_rcgr = 0x68030,

        .parent_map = gcc_xo_uniphy_gcc_rx_uniphy_gcc_tx_ubi32_pll_gpll0_map,

        .hid_width = 5,

        .freq_tbl = ftbl_gmac1_rx_clk_src,

        .clkr.hw.init = &(struct clk_init_data){

                .name = "gmac1_rx_clk_src",

                .parent_names = gcc_xo_uniphy_gcc_rx_uniphy_gcc_tx_ubi32_pll_gpll0,

                .num_parents = 5,

                .ops = &clk_rcg2_ops,

        },

};


可以通过clk debug接口查看一个clk的速率


正常:

/sys/kernel/debug/clk/gcc_gmac1_rx_clk# cat clk_rate

125000000

/sys/kernel/debug/clk# cat gmac1_rx_clk_src/clk_parent

uniphy_gcc_rx

/sys/kernel/debug/clk# cat gmac1_tx_clk_src/clk_parent

uniphy_gcc_tx


异常

/ # cat /sys/kernel/debug/clk/gcc_gmac1_tx_clk/clk_rate

50000000


抓住真凶

加打印,clk的名字,不能打印:rcg->clkr.hw.init->name,而是要打印:rcg->clkr.hw.core->name,初始化后init指针被置空。

但是core的结构体是定义在源文件中的,没导出来,添加一个只包含name的定义即可。

struct clk_core {

        const char              *name;

};


异常

[    8.915347] ===clk rcg2 gmac1_tx_clk_src der rate table 5 src 1

[    8.921264] ===clk rcg2 gmac1_tx_clk_src der rate table 3 src 1

[    8.915347] ===clk rcg2 gmac1_tx_clk_src der rate table 5 src 1

[    8.921264] ===clk rcg2 gmac1_tx_clk_src der rate table 3 src 1

[    8.927165] ===clk rcg2 gmac1_tx_clk_src der rate table 3 freq 125000000

[    8.933089] ===clk rcg2 gmac1_tx_clk_src update config

[    8.940018] ===clk rcg2 gmac1_tx_clk_src update config 2

[    8.944953] ===clk rcg2 gmac1_tx_clk_src recalc rate 50000000


正常

[   14.694712] ===clk rcg2 gmac1_tx_clk_src der rate table 5 src 1

[   14.700598] ===clk rcg2 gmac1_tx_clk_src der rate table 5 src 1

[   14.706516] ===clk rcg2 gmac1_tx_clk_src der rate table 4 src 1

[   14.712416] ===clk rcg2 gmac1_tx_clk_src der rate table 4 freq 125000000

[   14.718324] ===clk rcg2 gmac1_tx_clk_src update config

[   14.725266] ===clk rcg2 gmac1_tx_clk_src update config 2

[   14.730204] ===clk rcg2 gmac1_tx_clk_src recalc rate 125000000


为什么uboot进入命令行,再启动,之后业务正常了呢?原来是clk的逻辑会先读取当前rate,如果已经满足,就不会去变更了,看日志:

[    5.440594] ===clk rcg2 gmac1_tx_clk_src get parent 1

[    5.446205] ===clk rcg2 gmac1_rx_clk_src get parent 1

[    5.451270] ===clk rcg2 gmac1_rx_clk_src recalc rate 125000000

[    5.456528] ===clk rcg2 gmac1_tx_clk_src get parent 1

[    5.462035] ===clk rcg2 gmac1_tx_clk_src recalc rate 125000000


可以看到错误的时候,选错table了,再看gcc-ipq5018.c定义的table:

static const struct freq_tbl ftbl_gmac1_tx_clk_src[] = {

        F(2500000, P_UNIPHY_TX, 12.5, 0, 0),

        F(24000000, P_XO, 1, 0, 0),

        F(25000000, P_UNIPHY_TX, 2.5, 0, 0),

        F(125000000, P_UNIPHY_TX, 2.5, 0, 0),

        F(125000000, P_UNIPHY_TX, 1, 0, 0),

        F(312500000, P_UNIPHY_TX, 1, 0, 0),

        { }

};

table的第3项和第4项(下标从0开始)都是125000000Hz。查看qsdk的内核提交记录,原来有处理这种重复的情况,合入试试:


Author: Praveenkumar I <ipkumar@codeaurora.org>

Date:   Wed Jul 12 21:33:32 2017 +0530


    clk: qcom: support for duplicate freq in RCG2 freq table


    Currently RCG code looks up the frequency table during set

    rate and return the first available frequency greater than

    requested rate. If CLK_SET_RATE_PARENT flag is set then the

    set_rate request will go to its parent otherwise the clock

    framework will configure pre-div, m and n according to the

    returned frequency table entry. In this case, it is assuming

    that parent clock will run in the same frequency with which

    pre-div, m and n has been derived. But it may be possible

    that the parent clock supports multiple frequency and the

    same frequency can be derived with different pre-div, m and

    n values depending upon current frequency.  Also, the same

    frequency can be derived from different parent sources and

    currently there is no option for having duplicate

    frequencies in frequency table and choosing the best one

    according to current rate.


    Now this patch adds the support for having duplicate

    frequencies in frequency table. During set rate, it will

    compare the actual rate for each entry with requested rate

    and will select the best entry in which the difference will

    be less.


    The existing functionality won’t be affected with this code

    change since this code change will hit only if frequency

    table has duplicate values.


    Change-Id: I97d9e1b55d8f3ee095f6f01729af527ba90e50e5

    Signed-off-by: Abhishek Sahu <absahu@codeaurora.org>

    (cherry picked from commit 775e7d3b69ffc97afb5bd5a6c9c423f2f4d8a0b2)

    Signed-off-by: Praveenkumar I <ipkumar@codeaurora.org>


    Change-Id: If10193fc79a3c1375ab73597813745ff1f4df0ad


合入patch后,测试ok。


Copyright © linuxdev.cc 2017-2024. Some Rights Reserved.