ILD

4.4 kernel bridge fdb show missing fdb bug
作者:Yuan Jianpeng 邮箱:yuanjp89@163.com
发布时间:2023-9-15 站点:Inside Linux Development

qos2移植到另外一个机型后,桥模式qos模块限速失败,排查发现是找不到wan接口,导致tc规则配置不到wan口。


wan接口的解析方法是:

1 通过默认路由,拿到网关ip。

2 通过查询邻居表,从网关ip解析出网关mac。

3 通过查询bridge fdb表,从网关mac解析到桥port接口。


上述第3步失败了。第三步是c语言实现的,大致:

    1 sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE)

    2 发送请求:

nlmsg_type: RTM_GETNEIGH

nlmsg_flags: NLM_F_REQUEST|NLM_F_DUMP,

ifinfomsg.ifi_family = PF_BRIDGE

    3 接收消息,解析 struct ndmsg


发现死活解析不到网关mac的fdb条目,编译 bridge工具,使用 bridge fdb show发现也解析不到。

# ./bridge-mt7621 fdb show

8c:de:f9:53:4c:21 dev wl1 master br-lan permanent

01:00:5e:00:00:01 dev wl1 self permanent

8c:de:f9:53:4c:22 dev wl0 master br-lan permanent

01:00:5e:00:00:01 dev wl0 self permanent

33:33:00:00:00:01 dev wl2 self permanent

33:33:00:00:00:02 dev wl2 self permanent

33:33:00:00:00:01 dev wl4 self permanent

33:33:00:00:00:02 dev wl4 self permanent

33:33:00:00:00:01 dev wl6 self permanent

33:33:00:00:00:02 dev wl6 self permanent

33:33:00:00:00:01 dev wl8 self permanent

33:33:00:00:00:02 dev wl8 self permanent

33:33:00:00:00:01 dev wl15 self permanent

33:33:00:00:00:02 dev wl15 self permanent

33:33:00:00:00:01 dev apcli0 self permanent

33:33:00:00:00:02 dev apcli0 self permanent

33:33:00:00:00:01 dev apclix0 self permanent

33:33:00:00:00:02 dev apclix0 self permanent


# ip -4 route

default via 192.168.3.1 dev br-lan  proto static

192.168.3.0/24 dev br-lan  proto kernel  scope link  src 192.168.3.80

192.168.10.0/24 dev br-lan  proto kernel  scope link  src 192.168.10.1


# ip -4 neigh

192.168.3.100 dev br-lan lladdr 9c:7b:ef:45:a5:82 STALE

192.168.3.1 dev br-lan lladdr 4c:c6:4c:61:41:2a REACHABLE


# brctl show

bridge name     bridge id               STP enabled     interfaces

br-lan          7fff.a439b34001e2       no              eth1

                                                        eth0

                                                        wl1

                                                        wl0


难道是真的没有条目?如果没有条目,桥应该会把发往网关的包广播(泛洪)到所有port。于是再接入一台pc,在路由器上ping网关,pc上抓不到ping包,证明网络是ok的,桥没有广播,fdb条目应该是存在的。于是写一个内核模块打印fdb表。注意这个内核模块只能在4.4内核上运行,新内核中fdb表的数据结构已经完全变化了。

#include <linux/init.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include "../net/bridge/br_private.h"
 
static void dump_fdb(struct net_device *dev)
{
        struct net_bridge *br;
        int i;
        struct net_bridge_fdb_entry *fdb;
 
        br = netdev_priv(dev);
 
        for (i = 0; i < BR_HASH_SIZE; i++) {
 
                hlist_for_each_entry_rcu(fdb, &br->hash[i], hlist) {
 
                        unsigned char *m = fdb->addr.addr;
 
                        printk("bridge %s port %s mac %02x:%02x:%02x:%02x:%02x:%02x\n",
                                dev->name, fdb->dst->dev->name,
                                m[0], m[1], m[2], m[3], m[4], m[5]);
 
                }
        }
}
 
/*
 * test_init - module entry point: look up "br-lan" and dump its fdb.
 *
 * Always returns -1, on the error paths as well as after a successful dump,
 * so the module load fails every time.
 * NOTE(review): this looks deliberate (one-shot debug module that can be
 * insmod'ed repeatedly without rmmod) - confirm before changing.
 */
static int __init test_init(void)
{
        struct net_device *dev;

        /*
         * dev_get_by_name_rcu() takes no reference on the device, so the
         * lookup and every later use of dev must stay inside one RCU
         * read-side section.
         */
        rcu_read_lock();

        dev = dev_get_by_name_rcu(&init_net, "br-lan");
        if (!dev) {
                printk("no br-lan dev\n");
                goto out;
        }

        /* dump_fdb() reads netdev_priv() as a struct net_bridge. */
        if (!netif_is_bridge_master(dev)) {
                printk("br-lan not bridge\n");
                goto out;
        }

        dump_fdb(dev);

out:
        rcu_read_unlock();
        return -1;
}
 
/*
 * test_exit - module exit hook.
 *
 * Intentionally empty: test_init() registers nothing and allocates nothing,
 * so there is nothing to undo here.
 */
static void __exit test_exit(void)
{
}
 
/* Hook the entry and exit functions into the module loader. */
module_init(test_init);
module_exit(test_exit);
 
/* Module metadata. */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jianpeng Yuan");
MODULE_DESCRIPTION("test module");


# insmod ./test.ko

[55333.978921] bridge br-lan port eth0 mac 8c:de:f9:53:4c:20

[55333.984361] bridge br-lan port wl0 mac 8c:de:f9:53:4c:22

[55333.989762] bridge br-lan port eth1 mac 9c:7b:ef:45:a5:82

[55333.996045] bridge br-lan port eth1 mac a4:39:b3:40:01:e2

[55334.001477] bridge br-lan port wl1 mac 8c:de:f9:53:4c:21

[55334.006852] bridge br-lan port eth1 mac 4c:c6:4c:61:41:2a


可以看到网关mac 4c:c6:4c:61:41:2a是在eth1上面。证明内核的fdb表是没有问题的。是用户态dump出了问题。于是尝试给内核添加调试信息。


net/core/rtnetlink.c

rtnetlink_init() 注册了桥fdb的dump函数:

rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, rtnl_fdb_dump, NULL);


给rtnl_fdb_dump添加调试信息,下面代码中所有的printk都是新添加的调试信息:

static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
    struct net_device *dev;
    struct nlattr *tb[IFLA_MAX+1];
    struct net_device *br_dev = NULL;
    const struct net_device_ops *ops = NULL;
    const struct net_device_ops *cops = NULL;
    struct ifinfomsg *ifm = nlmsg_data(cb->nlh);
    struct net *net = sock_net(skb->sk);
    int brport_idx = 0;
    int br_idx = 0;
    int idx = 0;
 
    printk("start dump cb idx %d\n", cb->args[0]);
 
    if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX,
            ifla_policy) == 0) {
        if (tb[IFLA_MASTER])
            br_idx = nla_get_u32(tb[IFLA_MASTER]);
    }
 
    brport_idx = ifm->ifi_index;
 
    if (br_idx) {
        br_dev = __dev_get_by_index(net, br_idx);
        if (!br_dev)
            return -ENODEV;
 
        ops = br_dev->netdev_ops;
        printk("use br %s\n", br_dev->name);
    }
 
    for_each_netdev(net, dev) {
        if (brport_idx && (dev->ifindex != brport_idx))
            continue;
 
        if (!br_idx) { /* user did not specify a specific bridge */
            if (dev->priv_flags & IFF_BRIDGE_PORT) {
                br_dev = netdev_master_upper_dev_get(dev);
                cops = br_dev->netdev_ops;
                printk("dev %s br %s\n", dev->name, br_dev->name);
            }
 
        else {
            if (dev != br_dev &&
                !(dev->priv_flags & IFF_BRIDGE_PORT)) {
                printk("skip dev %s\n", dev->name);
                continue;
            }
 
            if (br_dev != netdev_master_upper_dev_get(dev) &&
                !(dev->priv_flags & IFF_EBRIDGE)) {
                printk("skip dev2 %s\n", dev->name);
                continue;
            }
 
            printk("use br2 %s\n", br_dev->name);
            cops = ops;
        }
 
        if (dev->priv_flags & IFF_BRIDGE_PORT) {
            if (cops && cops->ndo_fdb_dump) {
                printk("cops->ndo_fdb_dump br %s dev %s idx %d\n", br_dev->name, dev->name, idx);
                idx = cops->ndo_fdb_dump(skb, cb, br_dev, dev,
                             idx);
                printk("idx1 %d\n", idx);
            }
        }
 
        if (dev->netdev_ops->ndo_fdb_dump) {
            printk("dev->netdev_ops->ndo_fdb_dump %s idx %d\n", dev->name, idx);
            idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL,
                                idx);
            printk("idx2 %d\n", idx);
        }
        else {
            printk("ndo_dflt_fdb_dump %s idx %d\n", dev->name, idx);
            idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx);
            printk("idx3 %d\n", idx);
        }
 
        cops = NULL;
    }
 
    printk("dump end idx %d len %d\n", idx, skb->len);
    cb->args[0] = idx;
    return skb->len;
}


简单介绍一下这里的逻辑,遍历所有的接口,

1 如果是桥port,先执行桥的dump函数:idx = cops->ndo_fdb_dump(skb, cb, br_dev, dev, idx) 这个函数就会打印出这个桥里面这个port的fdb条目。

2 执行本接口的dump函数

    idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL, idx);

    如果没有dump函数,则执行 idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx);

idx保存到cb->args[0]。为什么要保存idx,因为单次dump,可能skb不够大,dump到一半就满了,此时把idx保存起来,netlink还会再次调用dump,直到skb->len为0。下次dump的时候,对于idx小于cb->args[0]的则跳过。每次dump都会遍历一遍,效率比较低,后续内核有优化。


桥的fdb dump函数是br_fdb_dump,在net/bridge/br_device.c里面初始化

            .ndo_fdb_dump            = br_fdb_dump,


br_fdb_dump()定义在net/bridge/br_fdb.c,给它也添加调试信息:

int br_fdb_dump(struct sk_buff *skb,
        struct netlink_callback *cb,
        struct net_device *dev,
        struct net_device *filter_dev,
        int idx)
{
    struct net_bridge *br = netdev_priv(dev);
    int i;
 
    if (!(dev->priv_flags & IFF_EBRIDGE))
        goto out;
 
    if (!filter_dev) {
        printk("ndo_dflt_fdb_dump %s at br_fdb_dump idx %d\n", dev->name, idx);
        idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx);
        printk("idx %d\n", idx);
    }
    else {
        printk("br_fdb_dump %s has filter %s idx %d\n", dev->name, filter_dev->name, idx);
    }
 
    for (i = 0; i < BR_HASH_SIZE; i++) {
        struct net_bridge_fdb_entry *f;
 
        hlist_for_each_entry_rcu(f, &br->hash[i], hlist) {
 
            unsigned char *m = f->addr.addr;
 
            printk("bridge %s port %s mac %02x:%02x:%02x:%02x:%02x:%02x\n",
                    dev->name, f->dst->dev->name,
                    m[0], m[1], m[2], m[3], m[4], m[5]);
 
            if (idx < cb->args[0]) {
                printk("skip 1\n");
                goto skip;
            }
 
            if (filter_dev &&
                (!f->dst || f->dst->dev != filter_dev)) {
                if (filter_dev != dev) {
                    printk("skip 2\n");
                    goto skip;
                }
                /* !f->dst is a special case for bridge
                 * It means the MAC belongs to the bridge
                 * Therefore need a little more filtering
                 * we only want to dump the !f->dst case
                 */
                if (f->dst) {
                    printk("skip 3\n");
                    goto skip;
                }
            }
            if (!filter_dev && f->dst) {
                printk("skip 4\n");
                goto skip;
            }
 
            printk("fill %d\n", idx);
 
            if (fdb_fill_info(skb, br, f,
                      NETLINK_CB(cb->skb).portid,
                      cb->nlh->nlmsg_seq,
                      RTM_NEWNEIGH,
                      NLM_F_MULTI) < 0) {
                printk("fill breaked\n");
                break;
            }
skip:
            ++idx;
        }
    }
 
out:
    return idx;
}


添加好调试后,重新编译烧录,然后执行bridge fdb show。内核输出如下:


start dump cb idx 0

ndo_dflt_fdb_dump lo idx 0

idx3 -22

dev eth0 br br-lan

cops->ndo_fdb_dump br br-lan dev eth0 idx -22

br_fdb_dump br-lan has filter eth0 idx -22

bridge br-lan port eth0 mac 8c:de:f9:53:4c:20

skip 1

bridge br-lan port wl0 mac 8c:de:f9:53:4c:22

skip 1

bridge br-lan port eth0 mac d4:3a:65:08:db:96

skip 1

bridge br-lan port eth1 mac 9c:7b:ef:45:a5:82

skip 1

bridge br-lan port eth1 mac a4:39:b3:40:01:e2

skip 1

。。。


问题已然非常清晰了,可以看到,打印eth0在桥br-lan中的fdb时,是有fdb条目的,但是被跳过了。跳过的原因是:

idx < cb->args[0]

也就是内核认为这个条目已经dump过了。但是idx的值是-22,显然是异常的。这个-22,是

idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx);

dump lo接口时返回的。查看ndo_dflt_fdb_dump的实现:

/*
 * ndo_dflt_fdb_dump - default fdb dump: walks the unicast (dev->uc) and
 * multicast (dev->mc) address lists of a device.
 * @skb:        reply buffer
 * @cb:         netlink dump state
 * @dev:        device whose address lists are dumped
 * @filter_dev: not used by this function
 * @idx:        running entry index on entry
 *
 * Returns the updated running entry index, except for non-ARPHRD_ETHER
 * devices, where it returns -EINVAL.
 *
 * NOTE(review): callers that assign the return value to their running
 * index will take that -EINVAL (-22) as the new index, i.e. a negative
 * entry index.
 */
int ndo_dflt_fdb_dump(struct sk_buff *skb,
              struct netlink_callback *cb,
              struct net_device *dev,
              struct net_device *filter_dev,
              int idx)
{
    int err;
 
    /* Error code returned through a channel that otherwise carries idx. */
    if (dev->type != ARPHRD_ETHER)
        return -EINVAL;
 
    /* The address lists are walked under the device address lock. */
    netif_addr_lock_bh(dev);
    /* presumably appends one message per address and advances idx - confirm */
    err = nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->uc);
    if (err)
        goto out;
    /* The result of the multicast pass is ignored. */
    nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->mc);
out:
    netif_addr_unlock_bh(dev);
    return idx;
}
EXPORT_SYMBOL(ndo_dflt_fdb_dump);


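非常明朗了,lo是环回接口,它的dev->type是ARPHRD_LOOPBACK,不是以太网类型,所以dev->type != ARPHRD_ETHER成立,函数返回了-EINVAL。而在4.4内核里,这个函数的返回值会被调用者直接当作新的idx使用(idx = ndo_dflt_fdb_dump(...)),显然这里不应该返回一个错误码:返回错误码,导致idx的值变成了负数(-22),之后的fdb条目在idx重新增长到cb->args[0]之前都会被当作"已经dump过"而跳过。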


那这个提交是怎么引入,又怎么revert的呢?


查看内核linux-4.4.y分支,git 提交记录,

$ git log --oneline --first-parent beijing/linux-4.4.y  -- net/core/rtnetlink.c | cut -f 1 -d' ' | while read commit ; do echo $commit ; git show $commit:net/core/rtnetlink.c | grep -A 15 "int ndo_dflt_fdb_dump" ;  done > /work/c.log


发现:

commit 266b50e76449bf4a2391aabd9cc8ec364f8e0589

Author: Eric Dumazet <edumazet@google.com>

Date:   Tue Dec 4 09:40:35 2018 -0800


    rtnetlink: ndo_dflt_fdb_dump() only work for ARPHRD_ETHER devices

    [ Upstream commit 688838934c231bb08f46db687e57f6d8bf82709c ]


diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c

index d2a46ffe6382..d52b633164c9 100644

--- a/net/core/rtnetlink.c

+++ b/net/core/rtnetlink.c

@@ -2931,6 +2931,9 @@ int ndo_dflt_fdb_dump(struct sk_buff *skb,

 {

        int err;


+       if (dev->type != ARPHRD_ETHER)

+               return -EINVAL;

+


$ git show 266b50e7:Makefile  | head -n 3

VERSION = 4

PATCHLEVEL = 4

SUBLEVEL = 167


这个提交是4.4.167之后引入的。


但是查看4.5.y分支,没有改动,查看master分支

$ git log --oneline --first-parent master -- net/core/rtnetlink.c | cut -f 1 -d' ' | while read commit ; do echo $commit ; git show $commit:net/core/rtnetlink.c | grep -A 15 "int ndo_dflt_fdb_dump" ;  done > /work/b.log


发现是下面提交第一次引入

commit d48f782e4fb20dc7ec935ca0ca41ae31e4a69362

Merge: 8586ca8a2144 35cc3cefc4de

Author: Linus Torvalds <torvalds@linux-foundation.org>

Date:   Sun Dec 9 15:12:33 2018 -0800


    Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net


查看版本:

$ git show d48f782e4fb2:Makefile  | head -n 5

# SPDX-License-Identifier: GPL-2.0

VERSION = 4

PATCHLEVEL = 20

SUBLEVEL = 0

EXTRAVERSION = -rc5


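后续都没有删除,那就清晰了:这个提交是在4.20开发周期(4.20-rc5之后)合入主线的。4.4作为long term维护分支,以bug fix的形式在4.4.168合入了这个改动。但是这个改动只适用于新的内核:新内核中ndo_dflt_fdb_dump通过指针参数(int *idx)更新idx,返回值才是错误码;而4.4中返回值就是idx本身,直接返回-EINVAL就破坏了idx,产生了错误。所以在4.4上需要revert这个改动。这个功能在同为4.4内核的高通ipq5018上没有问题,查看高通内核,发现其是4.4.60版本,此改动还未合入。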


小知识:git log -p -m选项,可以查看merge的代码改动。

Copyright © linuxdev.cc 2017-2024. Some Rights Reserved.