qos2移植到另外一个机型后,桥模式qos模块限速失败,排查发现是找不到wan接口,导致tc规则配置不到wan口。
wan接口的解析方法是:
1 通过默认路由,拿到网关ip。
2 通过查询邻居表,从网关ip解析出网关mac。
3 通过查询bridge fdb表,从网关mac解析到桥port接口。
上述第3步失败了。第三步是c语言实现的,大致:
1 sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE)
2 发送请求:
nlmsg_type: RTM_GETNEIGH
nlmsg_flags: NLM_F_REQUEST|NLM_F_DUMP,
ifinfomsg.ifi_family = PF_BRIDGE
3 接收消息,解析 struct ndmsg
发现死活解析不到网关mac的fdb条目,编译 bridge工具,使用 bridge fdb show发现也解析不到。
# ./bridge-mt7621 fdb show
8c:de:f9:53:4c:21 dev wl1 master br-lan permanent
01:00:5e:00:00:01 dev wl1 self permanent
8c:de:f9:53:4c:22 dev wl0 master br-lan permanent
01:00:5e:00:00:01 dev wl0 self permanent
33:33:00:00:00:01 dev wl2 self permanent
33:33:00:00:00:02 dev wl2 self permanent
33:33:00:00:00:01 dev wl4 self permanent
33:33:00:00:00:02 dev wl4 self permanent
33:33:00:00:00:01 dev wl6 self permanent
33:33:00:00:00:02 dev wl6 self permanent
33:33:00:00:00:01 dev wl8 self permanent
33:33:00:00:00:02 dev wl8 self permanent
33:33:00:00:00:01 dev wl15 self permanent
33:33:00:00:00:02 dev wl15 self permanent
33:33:00:00:00:01 dev apcli0 self permanent
33:33:00:00:00:02 dev apcli0 self permanent
33:33:00:00:00:01 dev apclix0 self permanent
33:33:00:00:00:02 dev apclix0 self permanent
# ip -4 route
default via 192.168.3.1 dev br-lan proto static
192.168.3.0/24 dev br-lan proto kernel scope link src 192.168.3.80
192.168.10.0/24 dev br-lan proto kernel scope link src 192.168.10.1
# ip -4 neigh
192.168.3.100 dev br-lan lladdr 9c:7b:ef:45:a5:82 STALE
192.168.3.1 dev br-lan lladdr 4c:c6:4c:61:41:2a REACHABLE
# brctl show
bridge name bridge id STP enabled interfaces
br-lan 7fff.a439b34001e2 no eth1
eth0
wl1
wl0
难道是真的没有条目?如果没有条目,桥应该会把报文广播(泛洪)到所有端口。于是再接入一台pc,在路由器上ping网关,pc上抓不到ping包:网络是ok的,也没有发生广播,说明内核里其实是有这条fdb条目的。于是写一个内核模块直接打印fdb表。注意这个内核模块只适用于4.4内核,新版本内核中fdb表的数据结构已经完全变化了。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 | #include <linux/init.h> #include <linux/module.h> #include <linux/netdevice.h> #include "../net/bridge/br_private.h" static void dump_fdb( struct net_device *dev) { struct net_bridge *br; int i; struct net_bridge_fdb_entry *fdb; br = netdev_priv(dev); for (i = 0; i < BR_HASH_SIZE; i++) { hlist_for_each_entry_rcu(fdb, &br->hash[i], hlist) { unsigned char *m = fdb->addr.addr; printk( "bridge %s port %s mac %02x:%02x:%02x:%02x:%02x:%02x\n" , dev->name, fdb->dst->dev->name, m[0], m[1], m[2], m[3], m[4], m[5]); } } } static int __init test_init( void ) { struct net_device *dev; dev = dev_get_by_name_rcu(&init_net, "br-lan" ); if (!dev) { printk( "no br-lan dev\n" ); return -1; } if (!netif_is_bridge_master(dev)) { printk( "br-lan not bridge\n" ); return -1; } dump_fdb(dev); return -1; } static void __exit test_exit( void ) { } module_init(test_init); module_exit(test_exit); MODULE_LICENSE( "GPL" ); MODULE_AUTHOR( "Jianpeng Yuan" ); MODULE_DESCRIPTION( "test module" ); |
# insmod ./test.ko
[55333.978921] bridge br-lan port eth0 mac 8c:de:f9:53:4c:20
[55333.984361] bridge br-lan port wl0 mac 8c:de:f9:53:4c:22
[55333.989762] bridge br-lan port eth1 mac 9c:7b:ef:45:a5:82
[55333.996045] bridge br-lan port eth1 mac a4:39:b3:40:01:e2
[55334.001477] bridge br-lan port wl1 mac 8c:de:f9:53:4c:21
[55334.006852] bridge br-lan port eth1 mac 4c:c6:4c:61:41:2a
可以看到网关mac 4c:c6:4c:61:41:2a是在eth1上面。证明内核的fdb表是没有问题的。是用户态dump出了问题。于是尝试给内核添加调试信息。
net/core/rtnetlink.c
rtnetlink_init() 注册了桥fdb的dump函数:
rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, rtnl_fdb_dump, NULL);
给rtnl_fdb_dump添加调试信息,如下所有的printk都是添加的调试信息:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 | static int rtnl_fdb_dump( struct sk_buff *skb, struct netlink_callback *cb) { struct net_device *dev; struct nlattr *tb[IFLA_MAX+1]; struct net_device *br_dev = NULL; const struct net_device_ops *ops = NULL; const struct net_device_ops *cops = NULL; struct ifinfomsg *ifm = nlmsg_data(cb->nlh); struct net *net = sock_net(skb->sk); int brport_idx = 0; int br_idx = 0; int idx = 0; printk( "start dump cb idx %d\n" , cb->args[0]); if (nlmsg_parse(cb->nlh, sizeof ( struct ifinfomsg), tb, IFLA_MAX, ifla_policy) == 0) { if (tb[IFLA_MASTER]) br_idx = nla_get_u32(tb[IFLA_MASTER]); } brport_idx = ifm->ifi_index; if (br_idx) { br_dev = __dev_get_by_index(net, br_idx); if (!br_dev) return -ENODEV; ops = br_dev->netdev_ops; printk( "use br %s\n" , br_dev->name); } for_each_netdev(net, dev) { if (brport_idx && (dev->ifindex != brport_idx)) continue ; if (!br_idx) { /* user did not specify a specific bridge */ if (dev->priv_flags & IFF_BRIDGE_PORT) { br_dev = netdev_master_upper_dev_get(dev); cops = br_dev->netdev_ops; printk( "dev %s br %s\n" , dev->name, br_dev->name); } } else { if (dev != br_dev && !(dev->priv_flags & IFF_BRIDGE_PORT)) { printk( "skip dev %s\n" , dev->name); continue ; } if (br_dev != netdev_master_upper_dev_get(dev) && !(dev->priv_flags & IFF_EBRIDGE)) { printk( "skip dev2 %s\n" , dev->name); continue ; } printk( "use br2 %s\n" , br_dev->name); cops = ops; } if (dev->priv_flags & IFF_BRIDGE_PORT) { if (cops && cops->ndo_fdb_dump) { printk( "cops->ndo_fdb_dump br %s dev %s idx %d\n" , br_dev->name, dev->name, idx); idx = cops->ndo_fdb_dump(skb, cb, br_dev, dev, idx); printk( "idx1 %d\n" , idx); } } if (dev->netdev_ops->ndo_fdb_dump) { printk( "dev->netdev_ops->ndo_fdb_dump %s idx %d\n" , dev->name, idx); idx 
= dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL, idx); printk( "idx2 %d\n" , idx); } else { printk( "ndo_dflt_fdb_dump %s idx %d\n" , dev->name, idx); idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx); printk( "idx3 %d\n" , idx); } cops = NULL; } printk( "dump end idx %d len %d\n" , idx, skb->len); cb->args[0] = idx; return skb->len; } |
简单介绍一下这里的逻辑,遍历所有的接口,
1 如果是桥port,先执行桥的dump函数:idx = cops->ndo_fdb_dump(skb, cb, br_dev, dev, idx) 这个函数就会打印出这个桥里面这个port的fdb条目。
2 执行本接口的dump函数
idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL, idx);
如果没有dump函数,则执行 idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx);
idx保存到cb->args[0]。为什么要保存idx,因为单次dump,可能skb不够大,dump到一半就满了,此时把idx保存起来,netlink还会再次调用dump,直到skb->len为0。下次dump的时候,对于idx小于cb->args[0]的则跳过。每次dump都会遍历一遍,效率比较低,后续内核有优化。
桥的fdb dump函数是br_fdb_dump,在net/bridge/br_device.c里面初始化
.ndo_fdb_dump = br_fdb_dump,
br_fdb_dump()定义在net/bridge/br_fdb.c,给它也添加调试信息:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 | int br_fdb_dump( struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int idx) { struct net_bridge *br = netdev_priv(dev); int i; if (!(dev->priv_flags & IFF_EBRIDGE)) goto out; if (!filter_dev) { printk( "ndo_dflt_fdb_dump %s at br_fdb_dump idx %d\n" , dev->name, idx); idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx); printk( "idx %d\n" , idx); } else { printk( "br_fdb_dump %s has filter %s idx %d\n" , dev->name, filter_dev->name, idx); } for (i = 0; i < BR_HASH_SIZE; i++) { struct net_bridge_fdb_entry *f; hlist_for_each_entry_rcu(f, &br->hash[i], hlist) { unsigned char *m = f->addr.addr; printk( "bridge %s port %s mac %02x:%02x:%02x:%02x:%02x:%02x\n" , dev->name, f->dst->dev->name, m[0], m[1], m[2], m[3], m[4], m[5]); if (idx < cb->args[0]) { printk( "skip 1\n" ); goto skip; } if (filter_dev && (!f->dst || f->dst->dev != filter_dev)) { if (filter_dev != dev) { printk( "skip 2\n" ); goto skip; } /* !f->dst is a special case for bridge * It means the MAC belongs to the bridge * Therefore need a little more filtering * we only want to dump the !f->dst case */ if (f->dst) { printk( "skip 3\n" ); goto skip; } } if (!filter_dev && f->dst) { printk( "skip 4\n" ); goto skip; } printk( "fill %d\n" , idx); if (fdb_fill_info(skb, br, f, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, NLM_F_MULTI) < 0) { printk( "fill breaked\n" ); break ; } skip: ++idx; } } out: return idx; } |
添加好调试后,重新编译烧录,然后执行bridge fdb show。内核输出如下:
start dump cb idx 0
ndo_dflt_fdb_dump lo idx 0
idx3 -22
dev eth0 br br-lan
cops->ndo_fdb_dump br br-lan dev eth0 idx -22
br_fdb_dump br-lan has filter eth0 idx -22
bridge br-lan port eth0 mac 8c:de:f9:53:4c:20
skip 1
bridge br-lan port wl0 mac 8c:de:f9:53:4c:22
skip 1
bridge br-lan port eth0 mac d4:3a:65:08:db:96
skip 1
bridge br-lan port eth1 mac 9c:7b:ef:45:a5:82
skip 1
bridge br-lan port eth1 mac a4:39:b3:40:01:e2
skip 1
。。。
问题已然非常清晰了,可以看到,打印eth0在桥br-lan中的fdb时,是有fdb条目的,但是被跳过了。跳过的原因是:
idx < cb->args[0]
也就是内核认为这个条目已经dump过了。但是idx的值是-22,显然是异常的。这个-22,是
idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx);
打印lo接口导致的。查看ndo_dflt_fdb_dump:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 | int ndo_dflt_fdb_dump( struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int idx) { int err; if (dev->type != ARPHRD_ETHER) return -EINVAL; netif_addr_lock_bh(dev); err = nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->uc); if (err) goto out; nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->mc); out: netif_addr_unlock_bh(dev); return idx; } EXPORT_SYMBOL(ndo_dflt_fdb_dump); |
非常明朗了,lo是环回接口,其类型是ARPHRD_LOOPBACK而不是以太网类型,所以dev->type != ARPHRD_ETHER成立,函数返回了-EINVAL。显然这里不应该返回一个错误码:在4.4内核中,这个函数的返回值会被调用者直接赋值给idx,返回错误码就导致idx的值变成了负数(-22)。
那这个提交是怎么引入,又怎么revert的呢?
查看内核linux-4.4.y分支,git 提交记录,
$ git log --oneline --first-parent beijing/linux-4.4.y -- net/core/rtnetlink.c | cut -f 1 -d' ' | while read commit ; do echo $commit ; git show $commit:net/core/rtnetlink.c | grep -A 15 "int ndo_dflt_fdb_dump" ; done > /work/c.log
发现:
commit 266b50e76449bf4a2391aabd9cc8ec364f8e0589
Author: Eric Dumazet <edumazet@google.com>
Date: Tue Dec 4 09:40:35 2018 -0800
rtnetlink: ndo_dflt_fdb_dump() only work for ARPHRD_ETHER devices
[ Upstream commit 688838934c231bb08f46db687e57f6d8bf82709c ]
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index d2a46ffe6382..d52b633164c9 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2931,6 +2931,9 @@ int ndo_dflt_fdb_dump(struct sk_buff *skb,
{
int err;
+ if (dev->type != ARPHRD_ETHER)
+ return -EINVAL;
+
$ git show 266b50e7:Makefile | head -n 3
VERSION = 4
PATCHLEVEL = 4
SUBLEVEL = 167
这个提交是4.4.167之后引入的。
但是查看4.5.y分支,没有改动,查看master分支
$ git log --oneline --first-parent master -- net/core/rtnetlink.c | cut -f 1 -d' ' | while read commit ; do echo $commit ; git show $commit:net/core/rtnetlink.c | grep -A 15 "int ndo_dflt_fdb_dump" ; done > /work/b.log
发现是下面提交第一次引入
commit d48f782e4fb20dc7ec935ca0ca41ae31e4a69362
Merge: 8586ca8a2144 35cc3cefc4de
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun Dec 9 15:12:33 2018 -0800
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
查看版本:
$ git show d48f782e4fb2:Makefile | head -n 5
# SPDX-License-Identifier: GPL-2.0
VERSION = 4
PATCHLEVEL = 20
SUBLEVEL = 0
EXTRAVERSION = -rc5
后续都没有删除,那就清晰了,这个提交是在4.20的rc阶段合入主线的。4.4作为long term维护分支,以bug fix的形式在4.4.168合入了这个改动。但是这个改动只适用于新的内核:新内核中ndo_dflt_fdb_dump()的idx是通过指针参数传递的,返回值本身就是错误码;而4.4中返回值就是idx,返回-EINVAL会直接破坏idx,于是在旧的内核上产生了错误,需要revert。这个功能,在同为4.4内核的高通ipq5018上面没有问题,查看高通内核,发现其是4.4.60版本,此改动还未合入。
小知识:git log -p -m选项,可以查看merge的代码改动。