ILD

network packet process path part 1 ethernet driver
作者:Yuan Jianpeng 邮箱:yuanjp89@163.com
发布时间:2022-9-3 站点:Inside Linux Development

本文分析了以太网驱动初始化、收包的过程,以高通IPQ5018平台nss dp驱动为例子。


NSS DP驱动分析

nss dp有两个ops, gmac_hal_ops和data_plane_ops。


hal/gmac_ops/syn/gmac/syn_if.c

/*
 * MAC hal_ops base structure
 */
struct nss_gmac_hal_ops syn_gmac_ops = {
        .init = &syn_init,
        .start =  &syn_start,
        .stop = &syn_stop,
        .exit = &syn_exit,
        .setmacaddr = &syn_set_mac_address,
        .getmacaddr = &syn_get_mac_address,
        .rxflowcontrol = &syn_rx_flow_control,
        .txflowcontrol = &syn_tx_flow_control,
        .setmaxframe = &syn_set_max_frame_size,
        .getmaxframe = &syn_get_max_frame_size,
        .getndostats = &syn_get_netdev_stats,
        .getssetcount = &syn_get_strset_count,
        .getstrings = &syn_get_strings,
        .getethtoolstats = &syn_get_eth_stats,
        .sendpause = &syn_send_pause_frame,
};

hal/dp_ops/syn_gmac_dp/syn_dp.c

/*
 * nss_dp_gmac_ops
 *      Data plane operations for Synopsys GMAC
 */
struct nss_dp_data_plane_ops nss_dp_gmac_ops = {
        .init           = syn_dp_if_init,
        .open           = syn_dp_if_open,
        .close          = syn_dp_if_close,
        .link_state     = syn_dp_if_link_state,
        .mac_addr       = syn_dp_if_mac_addr,
        .change_mtu     = syn_dp_if_change_mtu,
        .xmit           = syn_dp_if_xmit,
        .set_features   = syn_dp_if_set_features,
        .pause_on_off   = syn_dp_if_pause_on_off,
        .get_stats      = syn_dp_if_get_stats,
        .deinit         = syn_dp_if_deinit,
};


netdev->netdev_ops = &nss_dp_netdev_ops;

/*
 * Netdevice operations
 */
static const struct net_device_ops nss_dp_netdev_ops = {
        .ndo_open = nss_dp_open,
        .ndo_stop = nss_dp_close,
        .ndo_start_xmit = nss_dp_xmit,
        .ndo_get_stats64 = nss_dp_get_stats64,
        .ndo_set_mac_address = nss_dp_set_mac_address,
        .ndo_validate_addr = eth_validate_addr,
        .ndo_change_mtu = nss_dp_change_mtu,
        .ndo_do_ioctl = nss_dp_do_ioctl,

#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0))
        .ndo_bridge_setlink = switchdev_port_bridge_setlink,
        .ndo_bridge_getlink = switchdev_port_bridge_getlink,
        .ndo_bridge_dellink = switchdev_port_bridge_dellink,
#endif
#ifndef NSS_DP_IPQ50XX
        .ndo_select_queue = nss_dp_select_queue,
#endif

#ifdef CONFIG_RFS_ACCEL
        .ndo_rx_flow_steer = nss_dp_rx_flow_steer,
#endif
};


netdev_ops中的各个回调,最终调用的是上面gmac_hal_ops和data_plane_ops中的接口。


初始化

在nss_dp_probe接口中,会执行


gmac_hal_ops->init() == syn_init()

读取寄存器地址并map,执行 mac_base = devm_ioremap(...)

开启gmac clk


data_plane_ops->init() == syn_dp_if_init()


1 初始化rx ring

syn_dp_cfg_rx_setup_rings(dev_info)


两个结构体,一个rx结构体,一个是dma desc结构体:

/*
 * dma_desc_rx
 *      Rx DMA Descriptor Structure
 *
 * Enhanced descriptor format for receive.
 */
struct dma_desc_rx {
        uint32_t status;        /* Status */
        uint32_t length;        /* Buffer 1  and Buffer 2 length */
        uint32_t buffer1;       /* Network Buffer 1 pointer (DMA-able) */
        uint32_t buffer2;       /* Network Buffer 2 pointer (DMA-able) */
        /* This data below is used only by driver */
        uint32_t extstatus;     /* Extended status of a Rx Descriptor */
        uint32_t reserved1;     /* Reserved word */
        uint32_t timestamplow;  /* Lower 32 bits of the 64
                                   bit timestamp value */
        uint32_t timestamphigh; /* Higher 32 bits of the 64
                                           bit timestamp value */
        uint32_t padding[8];    /* Pad 32 byte to align to 64B cacheline size */
};


/*
 * syn_dp_info_rx
 */
struct syn_dp_info_rx {
        struct napi_struct napi_rx;     /* Rx NAPI */
        void __iomem *mac_base;         /* MAC base for register read/write */
        struct dma_desc_rx *rx_desc;    /* start address of RX descriptors ring or
                                           chain, this is used by the driver */
        uint32_t busy_rx_desc_cnt;      /* Number of Rx Descriptors owned by
                                           DMA at any given time */
        uint32_t rx_refill_idx;         /* index of the rx descriptor owned by DMA */
        uint32_t rx_idx;                /* index of the rx descriptor next available with driver */
        struct syn_dp_rx_buf rx_buf_pool[SYN_DP_RX_DESC_SIZE];
                                        /* Rx skb pool helping RX DMA descriptors */
        struct nss_dp_hal_gmac_stats_rx rx_stats;
                                        /* GMAC driver Rx statistics */
        struct net_device *netdev;      /* Net-device corresponding to the GMAC */
        struct device *dev;             /* Platform device corresponding to the GMAC */
        struct sk_buff *head;           /* Head of the skb list in case of Scatter-Gather frame */
        struct sk_buff *tail;           /* Tail of the skb list in case of Scatter-Gather frame */
        bool page_mode;                 /* page_mode: true for nr_frag and false for fraglist */
        uint32_t alloc_buf_len;         /* Skb alloc length, depends based on page/fraglist mode */
        uint32_t prev_len;              /* Stores frame_length of previous descriptor */
};

ring是一个descriptor的数组,descriptor有指针指向要存放数据包的地址。

所以初始化工作包括两个,分配desc数组,分配每个desc的buffer。

rx descriptor的个数是128

#define SYN_DP_RX_DESC_SIZE             128


a. 分配descriptor

syn_dp_cfg_rx_setup_desc_queue(dev_info);


first_desc = kzalloc(sizeof(struct dma_desc_rx) * SYN_DP_RX_DESC_SIZE, GFP_KERNEL);

dev_info->rx_desc_dma_addr = (dma_addr_t)virt_to_phys(first_desc);

rx_info->rx_desc = first_desc;

// 将descriptor数组置0, 将最后一个descriptor的length设置为DESC_RX_DESC_END_OF_RING。

syn_dp_gmac_rx_desc_init_ring(rx_info->rx_desc, SYN_DP_RX_DESC_SIZE);

rx_info->rx_refill_idx = 0;
rx_info->rx_idx = 0;

rx_info->busy_rx_desc_cnt = 0;


b. 为descriptor分配buffer

syn_dp_rx_refill(&dev_info->dp_info_rx);


skb = __netdev_alloc_skb(netdev, rx_info->alloc_buf_len, GFP_ATOMIC);

dma_addr = (dma_addr_t)virt_to_phys(skb->data);

syn_dp_rx_refill_one_desc(rx_desc, dma_addr, inval_len);

rx_info->rx_buf_pool[rx_refill_idx].skb = skb;



c. 下发descriptor物理地址给dma

syn_init_rx_desc_base(dev_info->mac_base, dev_info->rx_desc_dma_addr);


2 初始化tx ring

syn_dp_cfg_tx_setup_rings(dev_info)


同样也是两个结构体:

/*      
 * syn_dp_info_tx
 */                     
struct syn_dp_info_tx {
        struct napi_struct napi_tx;     /* Tx NAPI */
        void __iomem *mac_base;         /* MAC base for register read/write */
        struct dma_desc_tx *tx_desc;    /* start address of TX descriptors ring or
                                                chain, this is used by the driver */
        uint32_t busy_tx_desc_cnt;      /* Number of Tx Descriptors owned by
                                                DMA at any given time */
        uint32_t tx_comp_idx;           /* index of the tx descriptor owned by DMA */
        uint32_t tx_idx;                /* index of the tx descriptor next available with driver */
        struct syn_dp_tx_buf tx_buf_pool[SYN_DP_TX_DESC_SIZE];
                                        /* Tx skb pool helping TX DMA descriptors */
        struct nss_dp_hal_gmac_stats_tx tx_stats;
                                        /* GMAC driver Tx statistics */
        struct net_device *netdev;      /* Net-device corresponding to the GMAC */
        struct device *dev;             /* Platform device corresponding to the GMAC */
        struct sk_buff *skb_free_list[SYN_DP_NAPI_BUDGET_TX];
                                        /* Array to hold SKBs before free during Tx completion */
        size_t shinfo_addr_virt[SYN_DP_NAPI_BUDGET_TX];
                                        /* Array to hold SKB end pointer to be
                                                prefetched during Tx completion */
};


/*
 * dma_desc_tx
 *      Tx DMA Descriptor Structure
 *
 * Enhanced descriptor format for transmit.
 */
struct dma_desc_tx {
        uint32_t status;        /* Status */
        uint32_t length;        /* Buffer 1  and Buffer 2 length */
        uint32_t buffer1;       /* Network Buffer 1 pointer (DMA-able) */
        uint32_t buffer2;       /* Network Buffer 2 pointer (DMA-able) */
        uint32_t reserved1;     /* Reserved word */
        uint32_t reserved2;     /* Reserved word */
        uint32_t timestamplow;  /* Lower 32 bits of the 64
                                   bit timestamp value */
        uint32_t timestamphigh; /* Higher 32 bits of the 64
                                           bit timestamp value */
        uint32_t padding[8];    /* Pad 32 byte to align to 64B cacheline size */
};


tx descriptor的个数:

#define SYN_DP_TX_DESC_SIZE             1024


a. 分配descriptor

err = syn_dp_cfg_tx_setup_desc_queue(dev_info);


first_desc = dma_alloc_coherent(tx_info->dev, sizeof(struct dma_desc_tx) * SYN_DP_TX_DESC_SIZE, &dma_addr, GFP_KERNEL);

tx_info->tx_desc = first_desc;

dev_info->tx_desc_dma_addr = dma_addr;

// 将descriptor数组置0, 将最后一个descriptor的status设置为DESC_TX_DESC_END_OF_RING

syn_dp_gmac_tx_desc_init_ring(tx_info->tx_desc, SYN_DP_TX_DESC_SIZE);

tx_info->tx_comp_idx = 0;
tx_info->tx_idx = 0;
tx_info->busy_tx_desc_cnt = 0;



b. 下发descriptor

注意:这里没有为descriptor分配buffer, 因为此时还没有要tx的包。

syn_init_tx_desc_base(dev_info->mac_base, dev_info->tx_desc_dma_addr);


3 添加napi,这里添加了tx和rx两个napi

但是需要注意的是,tx的napi也是在NET_RX_SOFTIRQ(RX软中断)中处理的。

netif_napi_add(netdev, &rx_info->napi_rx, syn_dp_napi_poll_rx, SYN_DP_NAPI_BUDGET_RX);

netif_napi_add(netdev, &tx_info->napi_tx, syn_dp_napi_poll_tx, SYN_DP_NAPI_BUDGET_TX);


4 注册中断

rx和tx中断是共享的

request_irq(netdev->irq, syn_dp_handle_irq, 0, "nss-dp-gmac", &gmac_dev->dp_info.syn_info);


open接口

nss_dp_open


调用:data_plane_ops->open()

执行的是syn_dp_if_open()

1 打开rx/tx napi

napi_enable(&dp_info->syn_info.dp_info_rx.napi_rx);

napi_enable(&dp_info->syn_info.dp_info_tx.napi_tx);


2 打开dma

syn_enable_dma_tx(mac_base);


syn_enable_dma_rx(mac_base);

3 打开中断


再调用:gmac_hal_ops->start()

执行的是syn_start()

配置寄存器打开gmac的rx和tx


RX处理

1 中断触发

irqreturn_t syn_dp_handle_irq(int irq, void *ctx)

读取dma状态,如果有 SYN_DMA_INT_RX_COMPLETED,则

清掉rx dma status, 关闭rx dma中断,然后,最重要的是调度rx napi:

napi_schedule(&dp_info->dp_info_rx.napi_rx);


2 napi处理

执行:int syn_dp_napi_poll_rx(struct napi_struct *napi, int budget)

调用 work_done = syn_dp_rx(rx_info, budget); 进行收包处理,

然后调用 pending_refill = syn_dp_rx_refill(rx_info); 重新分配dma的buffer。


如果work_done等于budget, 则表示还没有收完,直接返回work_done.

如果work_done小于budget, 则表示收完了,需要将napi设置为完成,并重新打开rx中断:

napi_complete(napi);

syn_enable_rx_dma_interrupt(mac_base);


3 rx packet到kernel stack

第二步中的syn_dp_rx()是进行收包,


skb_put(rx_skb, frame_length);

rx_skb->protocol = eth_type_trans(rx_skb, netdev);


napi_gro_receive(&rx_info->napi_rx, rx_skb);

或者

netif_receive_skb(rx_skb);


TX处理

netdevice的ndo_start_xmit

nss_dp_xmit -> syn_dp_if_xmit


syn_dp_if_xmit()

调用 syn_dp_tx(tx_info, skb) , 如果成功,返回 NETDEV_TX_OK.

否则 释放skb, 增加统计计数,也返回NETDEV_TX_OK.

dev_kfree_skb_any(skb);

atomic64_inc((atomic64_t *)&tx_info->tx_stats.tx_dropped);


int syn_dp_tx(struct syn_dp_info_tx *tx_info, struct sk_buff *skb)


如果包是非线性的,调用syn_dp_tx_sg()进行处理。

if (unlikely(skb_is_nonlinear(skb)))

    return syn_dp_tx_sg(tx_info, skb);


如果可用tx descriptor的个数不足(这里用的是上次tx complete时更新的计数,期间可能又有一些descriptor已经完成tx,所以这个数据不是实时的),

则增加错误计数,并返回-1.


将skb的payload长度等设置到 tx desc

将skb信息缓存到 tx_info->tx_buf_pool[tx_idx]

更新tx desc的status.

这样就将skb下发到dma中去了。


调用 syn_resume_dma_tx(tx_info->mac_base); 开启dma tx

调用 atomic_inc((atomic_t *)&tx_info->busy_tx_desc_cnt); 增加dma未完成的个数。


TX complete处理

tx complete是用一个napi, 在rx软中断中完成的。它的目的是处理tx dma ring的完成情况:如果一个包已经被dma处理完,

就释放这个包对应的skb.


 1 中断触发

和rx共享中断,处理方式和rx中断触发是一样的,调用tx napi

napi_schedule(&dp_info->dp_info_tx.napi_tx);


2 处理 tx complete

计算出单次要处理的包的个数

busy = desc_cnt = atomic_read((atomic_t *)&tx_info->busy_tx_desc_cnt);

if (likely(busy > budget))

    busy = desc_cnt = budget;


然后处理busy个tx descriptor。拿到当前的desc, 及其状态:

desc = syn_dp_tx_comp_desc_get(tx_info);

status = desc->status;


如果status表明descriptor仍被dma所拥有,说明tx还没有完成,此时终止处理(跳出循环)。

if (unlikely(syn_dp_gmac_is_tx_desc_owned_by_dma(status)))

    break


如果status表明dma已经完成,则做一些统计,并缓存skb到skb_free_list.


最后,一次性释放skb_free_list中,busy个skb.


3 结束 napi, 重新开启中断

如果处理的个数小于budget, 则表明所有的dma已经complete, 此时

napi_complete(napi);

syn_enable_tx_dma_interrupt(mac_base);



总结

在注册接口的时候,调用netif_napi_add() 注册 RX/TX 两个napi. 注册一个中断。

在接口up的时候,调用napi_enable() 激活 napi,并打开中断。

在中断处理函数中,如果有rx, 则关闭rx中断,并调用napi_schedule()。

在中断处理函数中,如果有tx, 则关闭tx中断,并调用napi_schedule()。

rx的napi从dma拿包,并调用 napi_gro_receive() 或 netif_receive_skb() 将skb交给协议栈。

xmit接口,将包下发到dma中,并缓存skb到 tx_buf_pool[]

tx的napi, 处理已经完成的tx desc, 释放对应的skb



参考

https://blog.packagecloud.io/monitoring-tuning-linux-networking-stack-receiving-data/

https://epickrram.blogspot.com/2016/05/navigating-linux-kernel-network-stack.html


Copyright © linuxdev.cc 2017-2024. Some Rights Reserved.