本文分析了以太网驱动初始化、收包的过程,以高通IPQ5018平台nss dp驱动为例子。
nss dp有两个ops, gmac_hal_ops和data_plane_ops。
hal/gmac_ops/syn/gmac/syn_if.c
/*
 * MAC hal_ops base structure
 *
 * HAL callback table for the Synopsys GMAC. The nss-dp core calls these
 * for MAC lifecycle control, MAC-address / frame-size configuration,
 * flow control and ethtool statistics. As described in the notes below,
 * .init (syn_init) ioremaps the MAC register base and enables the GMAC
 * clock at probe time, and .start (syn_start) programs the registers
 * that enable MAC Rx/Tx when the interface is brought up.
 */
struct nss_gmac_hal_ops syn_gmac_ops = {
.init = &syn_init,			/* probe-time HW init: ioremap + clock enable */
.start = &syn_start,			/* enable MAC Rx/Tx (open path) */
.stop = &syn_stop,
.exit = &syn_exit,
.setmacaddr = &syn_set_mac_address,
.getmacaddr = &syn_get_mac_address,
.rxflowcontrol = &syn_rx_flow_control,
.txflowcontrol = &syn_tx_flow_control,
.setmaxframe = &syn_set_max_frame_size,
.getmaxframe = &syn_get_max_frame_size,
.getndostats = &syn_get_netdev_stats,
.getssetcount = &syn_get_strset_count,	/* ethtool: number of stat strings */
.getstrings = &syn_get_strings,		/* ethtool: stat string names */
.getethtoolstats = &syn_get_eth_stats,	/* ethtool: stat values */
.sendpause = &syn_send_pause_frame,
};
hal/dp_ops/syn_gmac_dp/syn_dp.c
/*
 * nss_dp_gmac_ops
 *	Data plane operations for Synopsys GMAC
 *
 * Data-path callback table. Per the call-flow notes in this article:
 * .init sets up the Rx/Tx descriptor rings at probe time
 * (syn_dp_cfg_rx_setup_rings / syn_dp_cfg_tx_setup_rings), .open
 * enables the Rx/Tx NAPI contexts and the DMA engines, and .xmit
 * places an skb on the Tx descriptor ring.
 */
struct nss_dp_data_plane_ops nss_dp_gmac_ops = {
.init = syn_dp_if_init,			/* descriptor ring setup (probe) */
.open = syn_dp_if_open,			/* napi_enable + enable DMA + irq */
.close = syn_dp_if_close,
.link_state = syn_dp_if_link_state,
.mac_addr = syn_dp_if_mac_addr,
.change_mtu = syn_dp_if_change_mtu,
.xmit = syn_dp_if_xmit,			/* queue skb on Tx ring */
.set_features = syn_dp_if_set_features,
.pause_on_off = syn_dp_if_pause_on_off,
.get_stats = syn_dp_if_get_stats,
.deinit = syn_dp_if_deinit,
};
netdev->netdev_ops = &nss_dp_netdev_ops;
/*
 * Netdevice operations
 *
 * Standard kernel net_device_ops. These are thin wrappers that
 * dispatch to the gmac_hal_ops / data_plane_ops registered above
 * (e.g. nss_dp_open -> data_plane_ops->open, then
 * gmac_hal_ops->start; nss_dp_xmit -> data_plane_ops->xmit).
 */
static const struct net_device_ops nss_dp_netdev_ops = {
.ndo_open = nss_dp_open,
.ndo_stop = nss_dp_close,
.ndo_start_xmit = nss_dp_xmit,
.ndo_get_stats64 = nss_dp_get_stats64,
.ndo_set_mac_address = nss_dp_set_mac_address,
.ndo_validate_addr = eth_validate_addr,
.ndo_change_mtu = nss_dp_change_mtu,
.ndo_do_ioctl = nss_dp_do_ioctl,
/* Pre-4.5 kernels route bridge netlink ops through switchdev helpers */
#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0))
.ndo_bridge_setlink = switchdev_port_bridge_setlink,
.ndo_bridge_getlink = switchdev_port_bridge_getlink,
.ndo_bridge_dellink = switchdev_port_bridge_dellink,
#endif
/* NOTE(review): custom queue selection is compiled out for IPQ50xx builds */
#ifndef NSS_DP_IPQ50XX
.ndo_select_queue = nss_dp_select_queue,
#endif
#ifdef CONFIG_RFS_ACCEL
.ndo_rx_flow_steer = nss_dp_rx_flow_steer,
#endif
};
netdev_ops是标准的内核netdev回调接口, 其各个实现最终调用的是上面注册的gmac_hal_ops和data_plane_ops。
在nss_dp_probe接口中,会执行
gmac_hal_ops->init() == syn_init()
读取寄存器地址并map,执行mac_base = devm_ioremap(
开启gmac clk
data_plane_ops->init() == syn_dp_if_init()
syn_dp_cfg_rx_setup_rings(dev_info)
这里涉及两个结构体: 一个是rx信息结构体syn_dp_info_rx, 一个是dma描述符结构体dma_desc_rx:
/*
 * dma_desc_rx
 *	Rx DMA Descriptor Structure
 *
 * Enhanced descriptor format for receive.
 *
 * The first four 32-bit words are the fields exchanged with the DMA
 * engine (ownership/status, buffer lengths, and the two DMA-able
 * buffer pointers). Total size is 64 bytes: 8 x 4-byte words plus the
 * 32-byte pad, matching the 64B cacheline as the padding comment notes.
 */
struct dma_desc_rx {
uint32_t status; /* Status */
uint32_t length; /* Buffer 1 and Buffer 2 length */
uint32_t buffer1; /* Network Buffer 1 pointer (DMA-able) */
uint32_t buffer2; /* Network Buffer 2 pointer (DMA-able) */
/* This data below is used only by driver */
uint32_t extstatus; /* Extended status of a Rx Descriptor */
uint32_t reserved1; /* Reserved word */
uint32_t timestamplow; /* Lower 32 bits of the 64 bit timestamp value */
uint32_t timestamphigh; /* Higher 32 bits of the 64 bit timestamp value */
uint32_t padding[8]; /* Pad 32 byte to align to 64B cacheline size */
};
/*
 * syn_dp_info_rx
 *	Per-GMAC Rx data-path state: the Rx NAPI context, the descriptor
 *	ring and its indices, the skb pool shadowing the ring, and
 *	scatter-gather reassembly state (head/tail).
 */
struct syn_dp_info_rx {
struct napi_struct napi_rx; /* Rx NAPI */
void __iomem *mac_base; /* MAC base for register read/write */
struct dma_desc_rx *rx_desc; /* start address of RX descriptors ring or
chain, this is used by the driver */
uint32_t busy_rx_desc_cnt; /* Number of Rx Descriptors owned by
DMA at any given time */
uint32_t rx_refill_idx; /* index of the rx descriptor owned by DMA */
uint32_t rx_idx; /* index of the rx descriptor next available with driver */
struct syn_dp_rx_buf rx_buf_pool[SYN_DP_RX_DESC_SIZE];
/* Rx skb pool helping RX DMA descriptors; entry i shadows descriptor i */
struct nss_dp_hal_gmac_stats_rx rx_stats;
/* GMAC driver Rx statistics */
struct net_device *netdev; /* Net-device corresponding to the GMAC */
struct device *dev; /* Platform device corresponding to the GMAC */
struct sk_buff *head; /* Head of the skb list in case of Scatter-Gather frame */
struct sk_buff *tail; /* Tail of the skb list in case of Scatter-Gather frame */
bool page_mode; /* page_mode: true for nr_frag and false for fraglist */
uint32_t alloc_buf_len; /* Skb alloc length, depends based on page/fraglist mode */
uint32_t prev_len; /* Stores frame_length of previous descriptor */
};
ring是一个descriptor的数组,descriptor有指针指向要存放数据包的地址。
所以初始化工作包括两个,分配desc数组,分配每个desc的buffer。
rx descriptor的个数是128
#define SYN_DP_RX_DESC_SIZE 128
a. 分配descriptor
syn_dp_cfg_rx_setup_desc_queue(dev_info);
first_desc = kzalloc(sizeof(struct dma_desc_rx) * SYN_DP_RX_DESC_SIZE, GFP_KERNEL);
dev_info->rx_desc_dma_addr = (dma_addr_t)virt_to_phys(first_desc);
rx_info->rx_desc = first_desc;
// 将descriptor数组置0, 将最后一个descriptor的length设置为DESC_RX_DESC_END_OF_RING。
syn_dp_gmac_rx_desc_init_ring(rx_info->rx_desc, SYN_DP_RX_DESC_SIZE);
rx_info->rx_refill_idx = 0;
rx_info->rx_idx = 0;
rx_info->busy_rx_desc_cnt = 0;
b. 为descriptor分配buffer
syn_dp_rx_refill(&dev_info->dp_info_rx);
skb = __netdev_alloc_skb(netdev, rx_info->alloc_buf_len, GFP_ATOMIC);
dma_addr = (dma_addr_t)virt_to_phys(skb->data);
syn_dp_rx_refill_one_desc(rx_desc, dma_addr, inval_len);
rx_info->rx_buf_pool[rx_refill_idx].skb = skb;
c 下发descriptor物理地址给dma
syn_init_rx_desc_base(dev_info->mac_base, dev_info->rx_desc_dma_addr);
syn_dp_cfg_tx_setup_rings(dev_info)
同样也是两个结构体:
/*
 * syn_dp_info_tx
 *	Per-GMAC Tx data-path state: the Tx-completion NAPI context, the
 *	descriptor ring and its indices, the skb pool shadowing the ring,
 *	and the batch-free arrays used during Tx completion.
 */
struct syn_dp_info_tx {
struct napi_struct napi_tx; /* Tx NAPI */
void __iomem *mac_base; /* MAC base for register read/write */
struct dma_desc_tx *tx_desc; /* start address of TX descriptors ring or
chain, this is used by the driver */
uint32_t busy_tx_desc_cnt; /* Number of Tx Descriptors owned by
DMA at any given time */
uint32_t tx_comp_idx; /* index of the tx descriptor owned by DMA */
uint32_t tx_idx; /* index of the tx descriptor next available with driver */
struct syn_dp_tx_buf tx_buf_pool[SYN_DP_TX_DESC_SIZE];
/* Tx skb pool helping TX DMA descriptors; entry i shadows descriptor i */
struct nss_dp_hal_gmac_stats_tx tx_stats;
/* GMAC driver Tx statistics */
struct net_device *netdev; /* Net-device corresponding to the GMAC */
struct device *dev; /* Platform device corresponding to the GMAC */
struct sk_buff *skb_free_list[SYN_DP_NAPI_BUDGET_TX];
/* Array to hold SKBs before free during Tx completion */
size_t shinfo_addr_virt[SYN_DP_NAPI_BUDGET_TX];
/* Array to hold SKB end pointer to be
prefetched during Tx completion */
};
/*
 * dma_desc_tx
 *	Tx DMA Descriptor Structure
 *
 * Enhanced descriptor format for transmit.
 *
 * Same 64-byte layout as dma_desc_rx: the first four 32-bit words are
 * shared with the DMA engine, and the 32-byte pad brings the struct to
 * one 64B cacheline.
 */
struct dma_desc_tx {
uint32_t status; /* Status */
uint32_t length; /* Buffer 1 and Buffer 2 length */
uint32_t buffer1; /* Network Buffer 1 pointer (DMA-able) */
uint32_t buffer2; /* Network Buffer 2 pointer (DMA-able) */
uint32_t reserved1; /* Reserved word */
uint32_t reserved2; /* Reserved word */
uint32_t timestamplow; /* Lower 32 bits of the 64 bit timestamp value */
uint32_t timestamphigh; /* Higher 32 bits of the 64 bit timestamp value */
uint32_t padding[8]; /* Pad 32 byte to align to 64B cacheline size */
};
tx descriptor的个数:
#define SYN_DP_TX_DESC_SIZE 1024
a. 分配descriptor
err = syn_dp_cfg_tx_setup_desc_queue(dev_info);
first_desc = dma_alloc_coherent(tx_info->dev, sizeof(struct dma_desc_tx) * SYN_DP_TX_DESC_SIZE, &dma_addr, GFP_KERNEL);
tx_info->tx_desc = first_desc;
dev_info->tx_desc_dma_addr = dma_addr;
// 将descriptor数组置0, 将最后一个descriptor的status设置为DESC_TX_DESC_END_OF_RING
syn_dp_gmac_tx_desc_init_ring(tx_info->tx_desc, SYN_DP_TX_DESC_SIZE);
tx_info->tx_comp_idx = 0;
tx_info->tx_idx = 0;
tx_info->busy_tx_desc_cnt = 0;
b. 下发descriptor
注意:这里没有为descriptor分配buffer, 因为此时还没有要tx的包。
syn_init_tx_desc_base(dev_info->mac_base, dev_info->tx_desc_dma_addr);
但是需要注意的是tx的napi也是在RX softirq中处理的。
netif_napi_add(netdev, &rx_info->napi_rx, syn_dp_napi_poll_rx, SYN_DP_NAPI_BUDGET_RX);
netif_napi_add(netdev, &tx_info->napi_tx, syn_dp_napi_poll_tx, SYN_DP_NAPI_BUDGET_TX);
rx和tx中断是共享的
request_irq(netdev->irq, syn_dp_handle_irq, 0, "nss-dp-gmac", &gmac_dev->dp_info.syn_info);
nss_dp_open
调用:data_plane_ops->open()
执行的是syn_dp_if_open()
1 打开rx/tx napi
napi_enable(&dp_info->syn_info.dp_info_rx.napi_rx);
napi_enable(&dp_info->syn_info.dp_info_tx.napi_tx);
2 打开dma
syn_enable_dma_tx(mac_base);
syn_enable_dma_rx(mac_base);
3 打开中断
再调用:gmac_hal_ops->start()
执行的是syn_start()
配置寄存器打开gmac的rx和tx
1 中断触发
irqreturn_t syn_dp_handle_irq(int irq, void *ctx)
读取dma状态,如果有 SYN_DMA_INT_RX_COMPLETED。则
清掉rx dma status, 关闭rx dma中断,然后,最重要调度rx napi
napi_schedule(&dp_info->dp_info_rx.napi_rx);
2 napi处理
执行:int syn_dp_napi_poll_rx(struct napi_struct *napi, int budget)
调用 work_done = syn_dp_rx(rx_info, budget); 进行收包处理,
然后调用 pending_refill = syn_dp_rx_refill(rx_info); 重新分配dma的buffer。
如果work_done等于budget, 则表示还没有收完, 直接返回work_done.
如果work_done小于budget, 则表示收完了, 需要将napi设置为完成, 并重新打开rx中断:
napi_complete(napi);
syn_enable_rx_dma_interrupt(mac_base);
3 rx packet到kernel stack
第二步中的syn_dp_rx()是进行收包,
skb_put(rx_skb, frame_length);
rx_skb->protocol = eth_type_trans(rx_skb, netdev);
napi_gro_receive(&rx_info->napi_rx, rx_skb);
或者
netif_receive_skb(rx_skb);
netdevice的ndo_start_xmit
nss_dp_xmit -> syn_dp_if_xmit
syn_dp_if_xmit()
调用 syn_dp_tx(tx_info, skb) , 如果成功,返回 NETDEV_TX_OK.
否则 释放skb, 增加统计计数,也返回NETDEV_TX_OK.
dev_kfree_skb_any(skb);
atomic64_inc((atomic64_t *)&tx_info->tx_stats.tx_dropped);
int syn_dp_tx(struct syn_dp_info_tx *tx_info, struct sk_buff *skb)
如果包是非线性的,调用syn_dp_tx_sg()进行处理。
if (unlikely(skb_is_nonlinear(skb)))
return syn_dp_tx_sg(tx_info, skb);
如果可用tx descriptor的个数不足(这里用的是上次tx complete时的统计值, 此刻可能又有一些descriptor已经完成tx, 所以这个数值不是实时的)
则增加错误计数,并返回-1.
将skb的payload长度等设置到 tx desc
将skb信息缓存到 tx_info->tx_buf_pool[tx_idx]
更新tx desc的status.
这样就将skb下发到dma中去了。
调用 syn_resume_dma_tx(tx_info->mac_base); 开启dma tx
调用 atomic_inc((atomic_t *)&tx_info->busy_tx_desc_cnt); 增加dma未完成的个数。
tx complete是用一个napi完成的, 运行在NET_RX_SOFTIRQ软中断中。它的目的是处理tx dma ring的完成情况: 如果一个包已经被dma处理完,
就释放这个包对应的skb。
1 中断触发
和rx共享中断,处理方式和rx中断触发是一样的,调用tx napi
napi_schedule(&dp_info->dp_info_tx.napi_tx);
2 处理 tx complete
计算出单次要处理的包的个数
busy = desc_cnt = atomic_read((atomic_t *)&tx_info->busy_tx_desc_cnt);
if (likely(busy > budget))
busy = desc_cnt = budget;
然后处理busy个tx descriptor。拿到当前的desc, 及其状态:
desc = syn_dp_tx_comp_desc_get(tx_info);
status = desc->status;
如果status显示该descriptor仍被dma所拥有, 表明还没有tx完成, 此时中止处理, 跳出循环:
if (unlikely(syn_dp_gmac_is_tx_desc_owned_by_dma(status)))
break
如果status表明dma已经完成,则做一些统计,并缓存skb到skb_free_list.
最后,一次性释放skb_free_list中,busy个skb.
3 开启 napi
如果处理的个数小于budget, 则表明所有的dma已经complete, 此时
napi_complete(napi);
syn_enable_tx_dma_interrupt(mac_base);
在注册接口的时候,调用netif_napi_add() 注册 RX/TX 两个napi. 注册一个中断。
在接口up的时候,调用napi_enable() 激活 napi,并打开中断。
在中断处理函数中,如果有rx, 则关闭rx中断,并调用napi_schedule()。
在中断处理函数中,如果有tx, 则关闭tx中断,并调用napi_schedule()。
rx的napi从dma拿包,并调用 napi_gro_receive() 或 netif_receive_skb() 将skb交给协议栈。
xmit接口,将包下发到dma中,并缓存skb到 tx_buf_pool[]
tx的napi, 处理已经完成的tx desc, 释放对应的skb
参考
https://blog.packagecloud.io/monitoring-tuning-linux-networking-stack-receiving-data/
https://epickrram.blogspot.com/2016/05/navigating-linux-kernel-network-stack.html