Linux内核中网络设备连接状态监测

在Linux中,网络设备驱动会定时检测设备的链路(载波)状态,即设备当前是否能够正常收发信号。当状态发生变化时,网络设备驱动程序会调用netif_carrier_on()或者netif_carrier_off()函数来通知内核。插拔网线、对端设备关闭或被禁用,都会导致连接状态改变。

netif_carrier_on() - 设备驱动检测到载波信号(链路接通)时调用
netif_carrier_off() - 设备驱动检测到载波信号丢失(链路断开)时调用

Linux内核中实现上述两个函数的代码位于net/sched/sch_generic.c文件中。它们均会调用linkwatch_fire_event()将事件加入到事件队列进行调度。

以netif_carrier_on()为例,函数调用过程如下:

netif_carrier_on() - net/sched/sch_generic.c
    linkwatch_fire_event() - net/core/link_watch.c
        linkwatch_add_event() - net/core/link_watch.c
        linkwatch_schedule_work() - net/core/link_watch.c
            mod_delayed_work() - include/linux/workqueue.h
                mod_delayed_work_on() - kernel/workqueue.c
                    __queue_delayed_work() - kernel/workqueue.c
            schedule_delayed_work() - include/linux/workqueue.h
                queue_delayed_work() - include/linux/workqueue.h
                    queue_delayed_work_on() - kernel/workqueue.c
                        __queue_delayed_work() - kernel/workqueue.c

接收到信号

当检测到设备获得载波信号时,函数netif_carrier_on()会被调用,它通过linkwatch_fire_event()函数将设备加入到事件处理队列进行处理。

/**
 *	netif_carrier_on - set carrier
 *	@dev: network device
 *
 * Device has detected acquisition of carrier.
 *
 * Clears __LINK_STATE_NOCARRIER in @dev->state. Only when the bit was
 * previously set (i.e. this call actually changes the state) does it
 * bump carrier_up_count, fire a link-watch event and, if the device is
 * running, call __netdev_watchdog_up().
 */
void netif_carrier_on(struct net_device *dev)
{
	/* Bit was already clear: carrier is already on, nothing to report. */
	if (!test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
		return;

	/* No accounting or events while reg_state is NETREG_UNINITIALIZED. */
	if (dev->reg_state == NETREG_UNINITIALIZED)
		return;

	/* Count this carrier-up transition. */
	atomic_inc(&dev->carrier_up_count);

	/* Hand the device to the link-watch event queue. */
	linkwatch_fire_event(dev);

	if (netif_running(dev))
		__netdev_watchdog_up(dev);
}
EXPORT_SYMBOL(netif_carrier_on);

丢失信号

当检测到设备载波信号丢失时,函数netif_carrier_off()会被调用,它通过linkwatch_fire_event()函数将设备加入到事件处理队列进行处理。

/**
 *	netif_carrier_off - clear carrier
 *	@dev: network device
 *
 * Device has detected loss of carrier.
 *
 * Sets __LINK_STATE_NOCARRIER in @dev->state. Only when the bit was
 * previously clear (i.e. this call actually changes the state) does it
 * bump carrier_down_count and fire a link-watch event.
 */
void netif_carrier_off(struct net_device *dev)
{
	/* Bit was already set: carrier is already off, nothing to report. */
	if (test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
		return;

	/* No accounting or events while reg_state is NETREG_UNINITIALIZED. */
	if (dev->reg_state == NETREG_UNINITIALIZED)
		return;

	/* Count this carrier-down transition. */
	atomic_inc(&dev->carrier_down_count);

	/* Hand the device to the link-watch event queue. */
	linkwatch_fire_event(dev);
}
EXPORT_SYMBOL(netif_carrier_off);

加入队列

linkwatch_fire_event()函数将设备加入到事件队列,并且进行事件调度,调度中会根据是否为紧急事件做不同处理。

/*
 * linkwatch_fire_event - queue a link-state event for @dev
 * @dev: network device whose link state changed
 *
 * Marks the device as having a pending link-watch event, adds it to the
 * event list if it was not pending yet, and schedules the link-watch
 * work. If an event is already pending and the new one is not urgent,
 * nothing further is done.
 */
void linkwatch_fire_event(struct net_device *dev)
{
	/* Classify the event before touching the pending bit. */
	bool urgent = linkwatch_urgent_event(dev);

	if (test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) {
		/* Already pending: only an urgent event needs rescheduling. */
		if (!urgent)
			return;
	} else {
		/* First pending event for this device: put it on the list. */
		linkwatch_add_event(dev);
	}

	linkwatch_schedule_work(urgent);
}
EXPORT_SYMBOL(linkwatch_fire_event);

/*
 * Decide whether a link event on @dev must be handled immediately.
 *
 * Returns false for a device that is not running. Returns true when the
 * device's ifindex differs from dev_get_iflink(dev), or when it is a LAG
 * port or LAG master. Otherwise the event is urgent only if the carrier
 * is OK and qdisc_tx_changing() reports true.
 */
static bool linkwatch_urgent_event(struct net_device *dev)
{
	/* A device that is not running never produces urgent events. */
	if (!netif_running(dev))
		return false;

	/*
	 * Urgent if ifindex != iflink, or if the device takes part in link
	 * aggregation (LAG port or master, e.g. bonding/team).
	 */
	if (dev->ifindex != dev_get_iflink(dev) ||
	    netif_is_lag_port(dev) ||
	    netif_is_lag_master(dev))
		return true;

	/* Otherwise: carrier is OK and the TX qdisc is changing. */
	return netif_carrier_ok(dev) && qdisc_tx_changing(dev);
}

/*
 * Append @dev to the global event list lweventlist, unless its
 * link_watch_list node is already linked somewhere. A tracked reference
 * on the device is taken when it is added. The whole operation runs
 * under lweventlist_lock with interrupts saved/disabled.
 */
static void linkwatch_add_event(struct net_device *dev)
{
	unsigned long irq_flags;

	spin_lock_irqsave(&lweventlist_lock, irq_flags);

	/* Node already linked: the device is queued, do not add it twice. */
	if (!list_empty(&dev->link_watch_list))
		goto unlock;

	list_add_tail(&dev->link_watch_list, &lweventlist);
	/* GFP_ATOMIC: we are holding a spinlock with IRQs off. */
	dev_hold_track(dev, &dev->linkwatch_dev_tracker, GFP_ATOMIC);

unlock:
	spin_unlock_irqrestore(&lweventlist_lock, irq_flags);
}

/*
 * Arm linkwatch_work so that the pending event list gets processed.
 *
 * @urgent: non-zero to request immediate execution of the work.
 *
 * If LW_URGENT is already set in linkwatch_flags the function returns
 * without doing anything. For an urgent request it sets LW_URGENT and
 * forces the delay to 0; otherwise the delay is the time remaining until
 * linkwatch_nextevent, reset to 0 if it exceeds HZ.
 */
static void linkwatch_schedule_work(int urgent)
{
	/* Jiffies from now until linkwatch_nextevent (unsigned arithmetic). */
	unsigned long delay = linkwatch_nextevent - jiffies;

	/* LW_URGENT is already set: return without scheduling again. */
	if (test_bit(LW_URGENT, &linkwatch_flags))
		return;

	/* Minimise down-time: drop delay for up event. */
	if (urgent) {
		/* LW_URGENT was already set by the time we tried to set it. */
		if (test_and_set_bit(LW_URGENT, &linkwatch_flags))
			return;
		/* Urgent: run the work immediately. */
		delay = 0;
	}

	/* If we wrap around we'll delay it by at most HZ. */
	/* A delay larger than HZ (one second) is replaced by 0, i.e. run now. */
	if (delay > HZ)
		delay = 0;

	/*
	 * If urgent, schedule immediate execution; otherwise, don't
	 * override the existing timer.
	 *
	 * LW_URGENT set: (re)arm the work with a delay of 0.
	 * LW_URGENT clear: queue the work with the computed delay.
	 */
	if (test_bit(LW_URGENT, &linkwatch_flags))
		mod_delayed_work(system_wq, &linkwatch_work, 0);
	else
		schedule_delayed_work(&linkwatch_work, delay);
}

事件处理

net/core/link_watch.c中通过如下语句声明了delayed_work结构(即linkwatch_work),并将其处理函数绑定为linkwatch_event(): static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event);

/*
 * Work handler for linkwatch_work.
 * @dummy: the work item; not used.
 *
 * Runs the link-watch queue under the RTNL lock. The queue is processed
 * in "urgent only" mode when linkwatch_nextevent is still in the future.
 */
static void linkwatch_event(struct work_struct *dummy)
{
	int urgent_only;

	rtnl_lock();
	/* Non-zero while the next regular event time has not been reached. */
	urgent_only = time_after(linkwatch_nextevent, jiffies);
	__linkwatch_run_queue(urgent_only);
	rtnl_unlock();
}

/*
 * Process devices queued on lweventlist.
 *
 * @urgent_only: non-zero when the next regular event time
 *               (linkwatch_nextevent) has not been reached yet, in which
 *               case only urgent events are handled; zero when the
 *               regular event time has been reached and every queued
 *               device may be handled.
 *
 * At most MAX_DO_DEV_PER_LOOP devices are handled per call (twice that
 * in urgent-only mode). Devices that are skipped or left over are put
 * back on lweventlist and the work is rescheduled as non-urgent.
 */
static void __linkwatch_run_queue(int urgent_only)
{
#define MAX_DO_DEV_PER_LOOP	100

	/* Budget: number of devices handled in this call (100 by default). */
	int do_dev = MAX_DO_DEV_PER_LOOP;
	struct net_device *dev;
	LIST_HEAD(wrk);

	/* Give urgent case more budget */
	/* Urgent-only mode: the budget is raised to 200. */
	if (urgent_only)
		do_dev += MAX_DO_DEV_PER_LOOP;

	/*
	 * Limit the number of linkwatch events to one
	 * per second so that a runaway driver does not
	 * cause a storm of messages on the netlink
	 * socket.  This limit does not apply to up events
	 * while the device qdisc is down.
	 */
	/* Regular event time reached: the next one is HZ jiffies from now. */
	if (!urgent_only)
		linkwatch_nextevent = jiffies + HZ;
	/* Limit wrap-around effect on delay. */
	/*
	 * Event time not reached but more than HZ ahead of now:
	 * reset it to the current time.
	 */
	else if (time_after(linkwatch_nextevent, jiffies + HZ))
		linkwatch_nextevent = jiffies;

	/* Clear the urgent flag so new urgent requests can be scheduled. */
	clear_bit(LW_URGENT, &linkwatch_flags);

	spin_lock_irq(&lweventlist_lock);
	/* Move every queued entry onto the local list wrk; lweventlist becomes empty. */
	list_splice_init(&lweventlist, &wrk);

	/* Walk the local list until it is empty or the budget is exhausted. */
	while (!list_empty(&wrk) && do_dev > 0) {

		/* Take the first device off the local list. */
		dev = list_first_entry(&wrk, struct net_device, link_watch_list);
		/* Unlink it and re-initialise its list node. */
		list_del_init(&dev->link_watch_list);

		/*
		 * Device not present, or urgent-only mode and this device's
		 * event is not urgent: defer it.
		 */
		if (!netif_device_present(dev) ||
		    (urgent_only && !linkwatch_urgent_event(dev))) {
			/* Put it back on lweventlist for a later run; budget is not consumed. */
			list_add_tail(&dev->link_watch_list, &lweventlist);
			continue;
		}
		/* We must free netdev tracker under
		 * the spinlock protection.
		 */
		netdev_tracker_free(dev, &dev->linkwatch_dev_tracker);
		spin_unlock_irq(&lweventlist_lock);
		/* Handle the device's link state with the list lock dropped. */
		linkwatch_do_dev(dev);
		do_dev--;
		spin_lock_irq(&lweventlist_lock);
	}

	/* Add the remaining work back to lweventlist */
	/* Splice whatever is left on wrk back into lweventlist; wrk becomes empty. */
	list_splice_init(&wrk, &lweventlist);

	/* Events are still queued: schedule another, non-urgent, run. */
	if (!list_empty(&lweventlist))
		linkwatch_schedule_work(0);
	spin_unlock_irq(&lweventlist_lock);
}

/*
 * Handle a pending link-watch event for one device.
 * @dev: device taken off the event list by the caller.
 *
 * Clears the device's LINKWATCH_PENDING bit, applies rfc2863_policy()
 * and, if the interface is up (IFF_UP), activates or deactivates the
 * device according to the carrier state and calls netdev_state_change().
 * Finally drops a device reference with __dev_put().
 */
static void linkwatch_do_dev(struct net_device *dev)
{
	/*
	 * Make sure the above read is complete since it can be
	 * rewritten as soon as we clear the bit below.
	 */
	smp_mb__before_atomic();

	/* We are about to handle this device,
	 * so new events can be accepted
	 */
	/* Clear the pending flag. */
	clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);

	/* NOTE(review): presumably updates the operational state per RFC 2863; helper not shown here. */
	rfc2863_policy(dev);
	/* Only act on the carrier state when the interface is up. */
	if (dev->flags & IFF_UP) {
		/* Carrier OK: activate the device (its qdisc, per the original note). */
		if (netif_carrier_ok(dev))
			dev_activate(dev);
		/* Carrier lost: deactivate the device (its qdisc, per the original note). */
		else
			dev_deactivate(dev);

		/* Report the device state change. */
		netdev_state_change(dev);
	}
	/* Note: our callers are responsible for calling netdev_tracker_free().
	 * This is the reason we use __dev_put() instead of dev_put().
	 */
	__dev_put(dev);
}