概述

1、macvlan是一种网卡虚拟化解决方案,能够将一块物理网卡虚拟成多块虚拟网卡。

2、macvlan 这种技术听起来有点像 VLAN,但它们的实现机制是完全不一样的。macvlan 子接口和原来的主接口是完全独立的,可以单独配置 MAC 地址和 IP 地址,而 VLAN 子接口和主接口共用相同的 MAC 地址。VLAN 用来划分广播域,而 macvlan 共享同一个广播域。

3、macvlan 会根据收到包的目的 MAC 地址判断这个包需要交给哪个虚拟网卡,虚拟网卡再把包交给上层的协议栈处理。

macvlan模式

根据 macvlan 子接口之间的通信模式,macvlan 有四种网络模式:

private 模式

vepa(virtual ethernet port aggregator) 模式

bridge 模式

passthru 模式

private模式

这种模式下,同一主接口下的子接口之间彼此隔离,不能通信。即使从外部的物理交换机导流,也会被无情地丢掉。

vepa模式

这种模式下,子接口之间的通信流量需要导到外部支持 802.1Qbg/VPEA 功能的交换机上(可以是物理的或者虚拟的),经由外部交换机转发,再绕回来。

注:802.1Qbg/VPEA 功能简单说就是交换机要支持 发夹(``hairpin``) 功能,也就是数据包从一个接口上收上来之后还能再扔回去。

bridge 模式

这种模式下,模拟的是 Linux bridge 的功能,但比 bridge 要好的一点是每个接口的 MAC 地址是已知的,不用学习。所以,这种模式下,子接口之间就是直接可以通信的。

passthru 模式

这种模式,只允许单个子接口连接主接口,且必须设置成混杂模式,一般用于子接口桥接和创建 VLAN 子接口的场景。

macvlan使用命令举例

x
ip link add link ethX(主设备) name ethX_1(虚拟macvlan设备) type macvlan mode private

虚拟macvlan(ethX_1)设备创建

void __init rtnetlink_init(void)
{
	if (register_pernet_subsys(&rtnetlink_net_ops))
		panic("rtnetlink_init: cannot initialize rtnetlink\n");

	register_netdevice_notifier(&rtnetlink_dev_notifier);
	
	rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink,
		      rtnl_dump_ifinfo, rtnl_calcit);
	rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, NULL);
	
	rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, NULL);
	
	rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, NULL);
	rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, NULL);
	rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, rtnl_fdb_dump, NULL);
	
	rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, NULL);
	rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, NULL);
	rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, NULL);

}
static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh)
{
		.....
		dev = rtnl_create_link(dest_net, ifname, ops, tb);
		.....
}


struct net_device *rtnl_create_link(struct net *net,
	char *ifname, const struct rtnl_link_ops *ops, struct nlattr *tb[])
{
		......
		dev = alloc_netdev_mqs(ops->priv_size, ifname, ops->setup,
			       num_tx_queues, num_rx_queues);
		.....
}

其中 ops->priv_size = sizeof(struct macvlan_dev)

int macvlan_link_register(struct rtnl_link_ops *ops)
{
	/* common fields */
	ops->priv_size		= sizeof(struct macvlan_dev);
	ops->validate		= macvlan_validate;
	ops->maxtype		= IFLA_MACVLAN_MAX;
	ops->policy		= macvlan_policy;
	ops->changelink		= macvlan_changelink;
	ops->get_size		= macvlan_get_size;
	ops->fill_info		= macvlan_fill_info;

	return rtnl_link_register(ops);

};

macvlan代码实现

关键数据结构

macvlan_port

与主设备关联;其成员dev即主设备net_device;vlan_hash、vlans记录了所有的macvlan子设备类别,一个用hash链表组织,一个采用普通链表组织。

struct macvlan_port {
	struct net_device	*dev;
	struct hlist_head	vlan_hash[MACVLAN_HASH_SIZE];
	struct list_head	vlans;
	struct rcu_head		rcu;
	bool 			passthru;
	int			count;
};

macvlan_dev

与虚拟子设备关联;macvlan_dev 存储在net_device的priv部分(dev->priv)。

其中最重要的是receive和forward两个函数指针,receive是子设备收包函数,forward函数用于子设备之间互相转发。

struct macvlan_dev *vlan = netdev_priv(dev);

struct macvlan_dev {
	struct net_device	*dev;
	struct list_head	list;
	struct hlist_node	hlist;
	struct macvlan_port	*port;
	struct net_device	*lowerdev;
	struct macvlan_pcpu_stats __percpu *pcpu_stats;

	DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ);
	
	enum macvlan_mode	mode;
	u16			flags;
	int (*receive)(struct sk_buff *skb);
	int (*forward)(struct net_device *dev, struct sk_buff *skb);
	struct macvtap_queue	*taps[MAX_MACVTAP_QUEUES];
	int			numvtaps;
	int			minor;

};

macvlan模块注册

static int __init macvlan_init_module(void)
{
	int err;

	register_netdevice_notifier(&macvlan_notifier_block);
	
	err = macvlan_link_register(&macvlan_link_ops);
	if (err < 0)
		goto err1;
	return 0;

err1:
	unregister_netdevice_notifier(&macvlan_notifier_block);
	return err;
}
static struct rtnl_link_ops macvlan_link_ops = {
	.kind		= "macvlan",
	.setup		= macvlan_setup,
	.newlink	= macvlan_newlink,
	.dellink	= macvlan_dellink,
};

macvlan虚拟子设备与主设备建立关联关系

macvlan_newlink

dev为虚拟子设备

static int macvlan_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
{
return macvlan_common_newlink(src_net, dev, tb, data,
netif_rx,
dev_forward_skb);
}

macvlan_common_newlink

int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
			   struct nlattr *tb[], struct nlattr *data[],
			   int (*receive)(struct sk_buff *skb),
			   int (*forward)(struct net_device *dev,
					  struct sk_buff *skb))
{
	struct macvlan_dev *vlan = netdev_priv(dev);
	struct macvlan_port *port;
	struct net_device *lowerdev;
	int err;

	if (!tb[IFLA_LINK])
		return -EINVAL;
	
	lowerdev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); /* 从入参获取主设备 */
	if (lowerdev == NULL)
		return -ENODEV;
	
	/* When creating macvlans on top of other macvlans - use
	 * the real device as the lowerdev.
	 */
	if (lowerdev->rtnl_link_ops == dev->rtnl_link_ops) {
		struct macvlan_dev *lowervlan = netdev_priv(lowerdev);
		lowerdev = lowervlan->lowerdev;
	}
	
	if (!tb[IFLA_MTU])
		dev->mtu = lowerdev->mtu;
	else if (dev->mtu > lowerdev->mtu)
		return -EINVAL;
	
	if (!tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);
	
	/* 如果主设备没有macvlan子设备,则创建 macvlan_port,与主设备关联起来 */
	if (!macvlan_port_exists(lowerdev)) {
		err = macvlan_port_create(lowerdev);
		if (err < 0)
			return err;
	}
	port = macvlan_port_get_rtnl(lowerdev);
	
	/* Only 1 macvlan device can be created in passthru mode */
	if (port->passthru)
		return -EINVAL;
	
	vlan->lowerdev = lowerdev;
	vlan->dev      = dev;
	vlan->port     = port;
	vlan->receive  = receive;
	vlan->forward  = forward;
	
	vlan->mode     = MACVLAN_MODE_VEPA;
	if (data && data[IFLA_MACVLAN_MODE])
		vlan->mode = nla_get_u32(data[IFLA_MACVLAN_MODE]);
	
	if (data && data[IFLA_MACVLAN_FLAGS])
		vlan->flags = nla_get_u16(data[IFLA_MACVLAN_FLAGS]);
	
	if (vlan->mode == MACVLAN_MODE_PASSTHRU) {
		if (port->count)
			return -EINVAL;
		port->passthru = true;
		memcpy(dev->dev_addr, lowerdev->dev_addr, ETH_ALEN);
	}
	
	err = netdev_upper_dev_link(lowerdev, dev);
	if (err)
		goto destroy_port;
	
	port->count += 1;
	err = register_netdevice(dev);
	if (err < 0)
		goto upper_dev_unlink;
	
	list_add_tail_rcu(&vlan->list, &port->vlans);
	netif_stacked_transfer_operstate(lowerdev, dev);
	
	return 0;

upper_dev_unlink:
	netdev_upper_dev_unlink(lowerdev, dev);
destroy_port:
	port->count -= 1;
	if (!port->count)
		macvlan_port_destroy(lowerdev);

	return err;

}

macvlan_port_create

下面创建macvlan_port的函数macvlan_port_create入参就是主设备的net_device。

static int macvlan_port_create(struct net_device *dev)
{
	struct macvlan_port *port;
	unsigned int i;
	int err;

	if (dev->type != ARPHRD_ETHER || dev->flags & IFF_LOOPBACK)
		return -EINVAL;
	
	port = kzalloc(sizeof(*port), GFP_KERNEL);
	if (port == NULL)
		return -ENOMEM;
	
	port->passthru = false;
	port->dev = dev; /* 与主设备关联 */
	INIT_LIST_HEAD(&port->vlans);
	for (i = 0; i < MACVLAN_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&port->vlan_hash[i]);
	
	err = netdev_rx_handler_register(dev, macvlan_handle_frame, port); /* 注册了rx_handler */
	if (err)
		kfree(port);
	else
		dev->priv_flags |= IFF_MACVLAN_PORT;
	return err;

}

macvlan_handle_frame:主设备 rx_handler

err = netdev_rx_handler_register(dev, macvlan_handle_frame, port);

数据包收发

macvlan rx

macvlan的rx是通过在主设备上面注册rx_handle, rx_handle的处理函数为macvlan_handle_frame。macvlan收到数据包按照如下过程处理:

static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb)
{
	struct macvlan_port *port;
	struct sk_buff *skb = *pskb;
	const struct ethhdr *eth = eth_hdr(skb);
	const struct macvlan_dev *vlan;
	const struct macvlan_dev *src;
	struct net_device *dev;
	unsigned int len = 0;
	int ret = NET_RX_DROP;

	port = macvlan_port_get_rcu(skb->dev);
	/*
	广播、多播处理
	1、利用源mac去macvlan_port中查找是否是端口forward的数据包
	2、如果源没有找到广播给所有端口
	3、如果源被找到了说明是bridge/evpa模式,如果源端口时evpa模式则叫广播扩散到evpa和bridge口,如果源端口是bridge模式则扩散给evpa口(bridge口在rx的时候早已转发),其它模式给该设备本身
	*/
	if (is_multicast_ether_addr(eth->h_dest)) {
		skb = ip_check_defrag(skb, IP_DEFRAG_MACVLAN);
		if (!skb)
			return RX_HANDLER_CONSUMED;
		eth = eth_hdr(skb);
		src = macvlan_hash_lookup(port, eth->h_source);
		if (!src)
			/* frame comes from an external address */
			macvlan_broadcast(skb, port, NULL,
					  MACVLAN_MODE_PRIVATE |
					  MACVLAN_MODE_VEPA    |
					  MACVLAN_MODE_PASSTHRU|
					  MACVLAN_MODE_BRIDGE);
		else if (src->mode == MACVLAN_MODE_VEPA)
			/* flood to everyone except source */
			macvlan_broadcast(skb, port, src->dev,
					  MACVLAN_MODE_VEPA |
					  MACVLAN_MODE_BRIDGE);
		else if (src->mode == MACVLAN_MODE_BRIDGE)
			/*
			 * flood only to VEPA ports, bridge ports
			 * already saw the frame on the way out.
			 */
			macvlan_broadcast(skb, port, src->dev,
					  MACVLAN_MODE_VEPA);
		else {
			/* forward to original port. */
			vlan = src;
			ret = macvlan_broadcast_one(skb, vlan, eth, 0);
			goto out;
		}
	
		return RX_HANDLER_PASS;
	}
	
	/*
	单播数据处理
	1、macvlan处于passthrough模式下,则数据包直接交给该port
	2、利用目的mac去macvlan_port中查找是否存在匹配的mac地址,查找到则调用macvlan_dev中的receive(这个和以太网不接收mac不是自己的数据一致)
	3、没有匹配的macvlan_dev则将数据交给master
	*/
	if (port->passthru)
		vlan = list_first_or_null_rcu(&port->vlans,
					      struct macvlan_dev, list);
	else
		vlan = macvlan_hash_lookup(port, eth->h_dest);
	
	if (vlan == NULL)
		return RX_HANDLER_PASS;
	
	dev = vlan->dev;
	if (unlikely(!(dev->flags & IFF_UP))) {
		kfree_skb(skb);
		return RX_HANDLER_CONSUMED;
	}
	len = skb->len + ETH_HLEN;
	skb = skb_share_check(skb, GFP_ATOMIC);
	if (!skb)
		goto out;
	
	skb->dev = dev;
	skb->pkt_type = PACKET_HOST;
	
	ret = vlan->receive(skb);

out:
	macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, 0);
	return RX_HANDLER_CONSUMED;
}

macvlan tx

macvlan的tx和模式有关,当macvlan是bridge模式的时候,tx的时候会将广播包或者单播目的mac地址是其它网卡的设备通过macvlan_dev的forward函数转发给其它的虚拟网卡。其它模式的时候将直接从主设备TX出去。

static const struct net_device_ops macvlan_netdev_ops = {
	.ndo_init		= macvlan_init,
	.ndo_uninit		= macvlan_uninit,
	.ndo_open		= macvlan_open,
	.ndo_stop		= macvlan_stop,
	.ndo_start_xmit		= macvlan_start_xmit,
	.ndo_change_mtu		= macvlan_change_mtu,
	.ndo_change_rx_flags	= macvlan_change_rx_flags,
	.ndo_set_mac_address	= macvlan_set_mac_address,
	.ndo_set_rx_mode	= macvlan_set_mac_lists,
	.ndo_get_stats64	= macvlan_dev_get_stats64,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_vlan_rx_add_vid	= macvlan_vlan_rx_add_vid,
	.ndo_vlan_rx_kill_vid	= macvlan_vlan_rx_kill_vid,
	.ndo_fdb_add		= macvlan_fdb_add,
	.ndo_fdb_del		= macvlan_fdb_del,
	.ndo_fdb_dump		= ndo_dflt_fdb_dump,
};
netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
			       struct net_device *dev)
{
	unsigned int len = skb->len;
	int ret;
	const struct macvlan_dev *vlan = netdev_priv(dev);

	ret = macvlan_queue_xmit(skb, dev);
	if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) {
		struct macvlan_pcpu_stats *pcpu_stats;
	
		pcpu_stats = this_cpu_ptr(vlan->pcpu_stats);
		u64_stats_update_begin(&pcpu_stats->syncp);
		pcpu_stats->tx_packets++;
		pcpu_stats->tx_bytes += len;
		u64_stats_update_end(&pcpu_stats->syncp);
	} else {
		this_cpu_inc(vlan->pcpu_stats->tx_dropped);
	}
	return ret;

}
static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
{
	const struct macvlan_dev *vlan = netdev_priv(dev);
	const struct macvlan_port *port = vlan->port;
	const struct macvlan_dev *dest;
	__u8 ip_summed = skb->ip_summed;

	if (vlan->mode == MACVLAN_MODE_BRIDGE) {
		const struct ethhdr *eth = (void *)skb->data;
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	
		/* send to other bridge ports directly */
		if (is_multicast_ether_addr(eth->h_dest)) {
			macvlan_broadcast(skb, port, dev, MACVLAN_MODE_BRIDGE);
			goto xmit_world;
		}
	
		dest = macvlan_hash_lookup(port, eth->h_dest);
		if (dest && dest->mode == MACVLAN_MODE_BRIDGE) {
			/* send to lowerdev first for its network taps */
			dev_forward_skb(vlan->lowerdev, skb);
	
			return NET_XMIT_SUCCESS;
		}
	}

xmit_world:
	skb->ip_summed = ip_summed;
	skb->dev = vlan->lowerdev;
	return dev_queue_xmit(skb);
}