ILD

Queue discipline
作者:Yuan Jianpeng 邮箱:yuanjp89@163.com
发布时间:2024-6-19 站点:Inside Linux Development

queue discipline (QDisc) 是Linux内核用来实现 QoS (Quality of Service) 的技术方案。qdisc的目的是:traffic shaping。qdisc分为两种,classful和classless,qdisc的具体细节后面分析,先来看下5.15内核源码。

源码阅读

qdisc位于协议栈和设备驱动之间。当内核协议栈有skb要发送时,调用:

    int dev_queue_xmit(struct sk_buff *skb)


当然也有接口,可以跳过qdisc,将skb直接发送给设备驱动。

    struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
                                    struct netdev_queue *txq, int *ret)

        ->

            static int xmit_one(struct sk_buff *skb, struct net_device *dev,
                    struct netdev_queue *txq, bool more)

            ->

                static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev,
                                            struct netdev_queue *txq, bool more)

                ->

                    ops->ndo_start_xmit(skb, dev);


dev_queue_xmit

qdisc挂在net device的 tx queue下面。所以首先选择一个tx queue,然后得到qdisc,如果qdisc有enqueue接口,就调用__dev_xmit_skb()进行处理,关键代码如下:


/*
 * Abridged excerpt of the transmit entry point: pick a tx queue for the skb,
 * fetch the qdisc attached to that queue and, if the qdisc provides an
 * enqueue hook, hand the skb to __dev_xmit_skb().
 *
 * NOTE: this is a shortened quotation; the declaration of rc, the "out"
 * label and the path taken when q->enqueue is NULL are not shown here.
 */
static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
{

        struct net_device *dev = skb->dev;
        struct netdev_queue *txq;
        struct Qdisc *q;

        /* Select the tx queue, then read the qdisc hanging off that queue. */
        txq = netdev_core_pick_tx(dev, skb, sb_dev);
        q = rcu_dereference_bh(txq->qdisc);

        trace_net_dev_queue(skb);
        /* Only a qdisc with an enqueue callback goes through the qdisc path. */
        if (q->enqueue) {
                rc = __dev_xmit_skb(skb, q, dev, txq);
                goto out;
        }
}


__dev_xmit_skb的核心是调用dev_qdisc_enqueue()入队,然后调用qdisc_run()调度。


/*
 * Abridged excerpt: enqueue the skb into qdisc q via dev_qdisc_enqueue(),
 * then call qdisc_run() on the same qdisc.
 *
 * NOTE: shortened quotation; the declarations of rc and to_free, locking and
 * the return statement are not shown here.
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
                                 struct net_device *dev,
                                 struct netdev_queue *txq)
{

                /* Step 1: put the packet on the qdisc. */
                rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
                /* Step 2: run the qdisc so queued packets can be sent. */
                qdisc_run(q)

}

/*
 * Enqueue skb on qdisc q through the qdisc's own enqueue callback.
 *
 * @skb:     packet to enqueue
 * @q:       target qdisc; its ->enqueue hook is invoked
 * @to_free: passed through unchanged to q->enqueue()
 * @txq:     tx queue, used here only for the trace point
 *
 * Returns the callback's result masked with NET_XMIT_MASK. The enqueue trace
 * point fires only when that masked result equals NET_XMIT_SUCCESS.
 */
static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q,
                             struct sk_buff **to_free,
                             struct netdev_queue *txq)
{
        int rc;

        /* Keep only the bits selected by NET_XMIT_MASK from the callback's return value. */
        rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK;
        if (rc == NET_XMIT_SUCCESS)
                trace_qdisc_enqueue(q, txq, skb);
        return rc;
}


Qdisc

qdisc实例用Qdisc结构体表示:


struct Qdisc {
        int                     (*enqueue)(struct sk_buff *skb,
                                           struct Qdisc *sch,
                                           struct sk_buff **to_free);
        struct sk_buff *        (*dequeue)(struct Qdisc *sch);
        unsigned int            flags;
        u32                     limit;
        const struct Qdisc_ops  *ops;


Qdisc_ops 结构体表示一种qdisc,它定义了qdisc相关操作的接口

struct Qdisc_ops {
        struct Qdisc_ops        *next;
        const struct Qdisc_class_ops    *cl_ops;
        char                    id[IFNAMSIZ];
        int                     priv_size;
        unsigned int            static_flags;

        int                     (*enqueue)(struct sk_buff *skb,
                                           struct Qdisc *sch,
                                           struct sk_buff **to_free);
        struct sk_buff *        (*dequeue)(struct Qdisc *);
        struct sk_buff *        (*peek)(struct Qdisc *);

        int                     (*init)(struct Qdisc *sch, struct nlattr *arg,
                                        struct netlink_ext_ack *extack);
        void                    (*reset)(struct Qdisc *);
        void                    (*destroy)(struct Qdisc *);
        int                     (*change)(struct Qdisc *sch,
                                          struct nlattr *arg,
                                          struct netlink_ext_ack *extack);
        void                    (*attach)(struct Qdisc *sch);

Schedule Qdisc

qdisc的调度函数是qdisc_run(),qdisc的traffic shaping是在dequeue实现的,enqueue只是入队。


/*
 * Dequeue from qdisc q and pass the result to sch_direct_xmit().
 *
 * @q:       qdisc to dequeue from
 * @packets: forwarded to dequeue_skb(); the caller (__qdisc_run) subtracts
 *           the value stored there from its quota
 *
 * Returns false when dequeue_skb() yields no skb; otherwise returns whatever
 * sch_direct_xmit() returns.
 */
static inline bool qdisc_restart(struct Qdisc *q, int *packets)
{
        spinlock_t *root_lock = NULL;
        struct netdev_queue *txq;
        struct net_device *dev;
        struct sk_buff *skb;
        bool validate;

        /* Dequeue packet */
        skb = dequeue_skb(q, &validate, packets);
        if (unlikely(!skb))
                return false;

        /* root_lock stays NULL for qdiscs flagged TCQ_F_NOLOCK. */
        if (!(q->flags & TCQ_F_NOLOCK))
                root_lock = qdisc_lock(q);

        /* Resolve the device and the tx queue recorded for this skb. */
        dev = qdisc_dev(q);
        txq = skb_get_tx_queue(dev, skb);

        return sch_direct_xmit(skb, q, dev, txq, root_lock, validate);
}

/*
 * Repeatedly call qdisc_restart() on q until it returns false or the quota
 * (initialised from dev_tx_weight) is used up.
 *
 * When the quota is exhausted the loop stops and the qdisc is marked for a
 * later run: TCQ_F_NOLOCK qdiscs get __QDISC_STATE_MISSED set in q->state,
 * all others are handed to __netif_schedule().
 */
void __qdisc_run(struct Qdisc *q)
{
        int quota = dev_tx_weight;
        int packets;

        while (qdisc_restart(q, &packets)) {
                /* Charge what this round reported through packets. */
                quota -= packets;
                if (quota <= 0) {
                        if (q->flags & TCQ_F_NOLOCK)
                                set_bit(__QDISC_STATE_MISSED, &q->state);
                        else
                                __netif_schedule(q);

                        break;
                }
        }
}


/*
 * Run qdisc q: __qdisc_run() is executed only if qdisc_run_begin() returns
 * true, and is always followed by qdisc_run_end(). If qdisc_run_begin()
 * returns false this function does nothing.
 */
static inline void qdisc_run(struct Qdisc *q)
{
        if (qdisc_run_begin(q)) {
                __qdisc_run(q);
                qdisc_run_end(q);
        }
}


从内核源码看,qdisc有2个调度路径:一是发送路径,__dev_xmit_skb()入队后直接调用qdisc_run();二是软中断处理函数net_tx_action(),遍历待调度的qdisc并逐个调用qdisc_run():


/*
 * Abridged excerpt of the tx softirq action: walk a list of qdiscs linked
 * through ->next_sched, clear __QDISC_STATE_SCHED on each and run it with
 * qdisc_run(), all inside an RCU read-side section.
 *
 * NOTE: shortened quotation; how "head" is obtained and the rest of the
 * function body are not shown here.
 */
static __latent_entropy void net_tx_action(struct softirq_action *h)

{

                rcu_read_lock();
                while (head) {
                        struct Qdisc *q = head;
                        /* Advance first so q can be processed independently. */
                        head = head->next_sched;


                        clear_bit(__QDISC_STATE_SCHED, &q->state);
                        qdisc_run(q);

                }

                rcu_read_unlock()

}

How qdisc deals with multiqueue devices

普通的qdisc,应用到所有的tx queue。有mq qdisc,为每个tx queue创建qdisc。


参考

【1】 queueing in linux-htb. https://unix.stackexchange.com/questions/503563/queueing-in-linux-htb


Classful disciplines are (as the answer by sourcejedi says) flexible. They allow you to attach children classful qdiscs to them and can share bandwidth with other classes, when possible. Leaf classes have a classless qdisc (elementary/fundamental qdisc) attached to them (also called an elementary qdisc). The queues managed by these elementary qdiscs is where the packets get queued and dequeued. The packets are dequeued and enqueued from these classes by an algorithm corresponding to the class. Examples of classful qdiscs are: HTB and CBQ.

Classless qdiscs are the fundamental or the elementary qdiscs, which are rigid in the sense that they cannot have children qdiscs attached to them, nor can they share bandwidth. In naive terms, they are on their own. These qdiscs own a queue from which they queue and dequeue packets according to the algorithm corresponding to the qdisc. Examples of classless qdisc: pfifo, bfifo, pfifo_fast (default used by Linux tc), tbf, sfq and a few more.


When a leaf node is created (in context of HTB qdisc), pfifo qdisc is attached to the leaf class by default.  This pfifo is initialized with a queue limit of txqueuelen of the interface.  This can be found in the function htb_change_class() in sch_htb.c, line 1395:


The kernel interacts directly with the root qdisc (maybe classful or classless) when it wants to queue or dequeue a packet. If the root qdisc is classful and has children, then it first classifies the packet (decides which child to send the packet to)


static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
                                      int *qerr)

On reading the comments, one can easily infer that this function returns one of the following:
    NULL, if the packet should be dropped
    -1, if the packet should be queued into direct_queue
    a leaf node (which contains an elementary qdisc, where the packets actually end up) This function traverses through all the interior nodes (Classes) of the tree until it returns a leaf node, where the packet should be queued.

While dequeuing, each of the classes follow the algo associated with their qdisc to decide which of the children to dequeue from, and children do the same thing, until a packet is dequeued from an elementary qdisc attached to a leaf class. This also ensures that the rate of a child class is no more than its parent. (Since the parent will decide whether the packet will pass through or not). I have not gone through the source of dequeuing in htb, so I can't provide a source for that.

Direct queue:

It is a special internal fifo queue maintained by the HTB qdisc from which the packets are dequeued at hardware speed. Its queue length is txqueuelen. A packet ends up in a direct queue if HTB is unable to classify it into one of the children qdiscs, and the default is not specified.


【2】lartc (Linux Advanced Routing & Traffic Control) official. https://www.lartc.org/

【3】lartc pdf. https://www.lartc.org/lartc.pdf


【4】htb home. http://luxik.cdi.cz/~devik/qos/htb/

        HTB Linux queuing discipline manual - user guide. http://luxik.cdi.cz/~devik/qos/htb/manual/userg.htm

        Hierachical token bucket theory. http://luxik.cdi.cz/~devik/qos/htb/manual/theory.htm


【5】tcio (Linux Network Traffic Control, Implementation Overview). http://luxik.cdi.cz/~devik/qos/papers/tcio-current.ps.gz


【6】HTB: quantum vs. burst. https://lartc.vger.kernel.narkive.com/zTL150Sl/htb-quantum-vs-burst


Stef Coene

Quantum and bursts have nothing common. Quantum is used to share remaining
bandwidth between child classes. So each class can send "quantum" bytes.
Each class is controlled with 2 buckets, one for the rate, one for the ceil.
These buckets have also a burst and this is burst for rate and cburst for
ceil.

Very simplified situation as example : So even if you have a big quantum of
let's say 60.000 bytes and you have a ceil of 6.000 bytes/s, you can only
send 6.000 packets / second so it takes 10 seconds to send. But if you have
a burst of 30.000 byts/s and you have a very fast connection, you can send
30.000 bytes very fast, but the remaining packets are send at ceil speed so
6.000 bytes/s. So it will take 5 seconds to send all the data.


【7】Traffic Shaping tc-htb, burst has no effect.

https://stackoverflow.com/questions/46049140/traffic-shaping-tc-htb-burst-has-no-effect


'ceil' specifies how much bandwidth a traffic class can borrow from a parent class if there is spare bandwidth available from peer classes. However, when applied to the root qdisc there is no parent to borrow from - so specifying ceil different to rate is meaningless for a class on a root qdisc.

'burst' specifies the amount of packets that are sent (at full link speed) from one class before stopping to serve another class, & the rate shaping being achieved by averaging the bursts over time. If applied to root with no child classes, it will only affect the accuracy of the averaging (smoothing), & won't do anything to the true average rate.


【8】HOWTO for multiqueue network device support.

https://www.kernel.org/doc/html/v5.8/networking/multiqueue.html


【9】Linux TX Multiqueue Implementation.

http://vger.kernel.org/~davem/davem_seattle08.pdf


 

Copyright © linuxdev.cc 2017-2024. Some Rights Reserved.