概述
1、macvlan是一种网卡虚拟化解决方案,能够将一块物理网卡虚拟成多块虚拟网卡。
2、macvlan 这种技术听起来有点像 VLAN,但它们的实现机制是完全不一样的。macvlan 子接口和原来的主接口是完全独立的,可以单独配置 MAC 地址和 IP 地址,而 VLAN 子接口和主接口共用相同的 MAC 地址。VLAN 用来划分广播域,而 macvlan 共享同一个广播域。
3、macvlan 会根据收到包的目的 MAC 地址判断这个包需要交给哪个虚拟网卡,虚拟网卡再把包交给上层的协议栈处理。
macvlan模式
根据 macvlan 子接口之间的通信模式,macvlan 有四种网络模式:
private 模式
vepa(virtual ethernet port aggregator) 模式
bridge 模式
passthru 模式
private模式
这种模式下,同一主接口下的子接口之间彼此隔离,不能通信。即使从外部的物理交换机导流,也会被无情地丢掉。
vepa模式
这种模式下,子接口之间的通信流量需要导到外部支持 802.1Qbg/VPEA
功能的交换机上(可以是物理的或者虚拟的),经由外部交换机转发,再绕回来。
注:802.1Qbg/VPEA
功能简单说就是交换机要支持 发夹(``hairpin``)
功能,也就是数据包从一个接口上收上来之后还能再扔回去。
bridge 模式
这种模式下,模拟的是 Linux bridge 的功能,但比 bridge 要好的一点是每个接口的 MAC 地址是已知的,不用学习。所以,这种模式下,子接口之间就是直接可以通信的。
passthru 模式
这种模式,只允许单个子接口连接主接口,且必须设置成混杂模式,一般用于子接口桥接和创建 VLAN 子接口的场景。
macvlan使用命令举例
x
ip link add link ethX(主设备) name ethX_1(虚拟macvlan设备) type macvlan mode private
虚拟macvlan(ethX_1)设备创建
void __init rtnetlink_init(void)
{
if (register_pernet_subsys(&rtnetlink_net_ops))
panic("rtnetlink_init: cannot initialize rtnetlink\n");
register_netdevice_notifier(&rtnetlink_dev_notifier);
rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink,
rtnl_dump_ifinfo, rtnl_calcit);
rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, NULL);
rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, NULL);
rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, NULL);
rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, NULL);
rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, NULL);
rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, NULL);
rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, NULL);
rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, rtnl_fdb_dump, NULL);
rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, NULL);
rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, NULL);
rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, NULL);
}
static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh)
{
.....
dev = rtnl_create_link(dest_net, ifname, ops, tb);
.....
}
struct net_device *rtnl_create_link(struct net *net,
char *ifname, const struct rtnl_link_ops *ops, struct nlattr *tb[])
{
......
dev = alloc_netdev_mqs(ops->priv_size, ifname, ops->setup,
num_tx_queues, num_rx_queues);
.....
}
其中 ops->priv_size = sizeof(struct macvlan_dev)
int macvlan_link_register(struct rtnl_link_ops *ops)
{
/* common fields */
ops->priv_size = sizeof(struct macvlan_dev);
ops->validate = macvlan_validate;
ops->maxtype = IFLA_MACVLAN_MAX;
ops->policy = macvlan_policy;
ops->changelink = macvlan_changelink;
ops->get_size = macvlan_get_size;
ops->fill_info = macvlan_fill_info;
return rtnl_link_register(ops);
};
macvlan代码实现
关键数据结构
macvlan_port
与主设备关联;其成员dev即主设备net_device;vlan_hash、vlans记录了所有的macvlan子设备类别,一个用hash链表组织,一个采用普通链表组织。
struct macvlan_port {
struct net_device *dev;
struct hlist_head vlan_hash[MACVLAN_HASH_SIZE];
struct list_head vlans;
struct rcu_head rcu;
bool passthru;
int count;
};
macvlan_dev
与虚拟子设备关联;macvlan_dev 存储在net_device的priv部分(dev->priv)。
其中最重要的是receive和forward两个函数指针,receive是子设备收包函数,forward函数用于子设备之间互相转发。
struct macvlan_dev *vlan = netdev_priv(dev);
struct macvlan_dev {
struct net_device *dev;
struct list_head list;
struct hlist_node hlist;
struct macvlan_port *port;
struct net_device *lowerdev;
struct macvlan_pcpu_stats __percpu *pcpu_stats;
DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ);
enum macvlan_mode mode;
u16 flags;
int (*receive)(struct sk_buff *skb);
int (*forward)(struct net_device *dev, struct sk_buff *skb);
struct macvtap_queue *taps[MAX_MACVTAP_QUEUES];
int numvtaps;
int minor;
};
macvlan模块注册
static int __init macvlan_init_module(void)
{
int err;
register_netdevice_notifier(&macvlan_notifier_block);
err = macvlan_link_register(&macvlan_link_ops);
if (err < 0)
goto err1;
return 0;
err1:
unregister_netdevice_notifier(&macvlan_notifier_block);
return err;
}
static struct rtnl_link_ops macvlan_link_ops = {
.kind = "macvlan",
.setup = macvlan_setup,
.newlink = macvlan_newlink,
.dellink = macvlan_dellink,
};
macvlan虚拟子设备与主设备建立关联关系
macvlan_newlink
dev为虚拟子设备
static int macvlan_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
{
return macvlan_common_newlink(src_net, dev, tb, data,
netif_rx,
dev_forward_skb);
}
macvlan_common_newlink
int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[],
int (*receive)(struct sk_buff *skb),
int (*forward)(struct net_device *dev,
struct sk_buff *skb))
{
struct macvlan_dev *vlan = netdev_priv(dev);
struct macvlan_port *port;
struct net_device *lowerdev;
int err;
if (!tb[IFLA_LINK])
return -EINVAL;
lowerdev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); /* 从入参获取主设备 */
if (lowerdev == NULL)
return -ENODEV;
/* When creating macvlans on top of other macvlans - use
* the real device as the lowerdev.
*/
if (lowerdev->rtnl_link_ops == dev->rtnl_link_ops) {
struct macvlan_dev *lowervlan = netdev_priv(lowerdev);
lowerdev = lowervlan->lowerdev;
}
if (!tb[IFLA_MTU])
dev->mtu = lowerdev->mtu;
else if (dev->mtu > lowerdev->mtu)
return -EINVAL;
if (!tb[IFLA_ADDRESS])
eth_hw_addr_random(dev);
/* 如果主设备没有macvlan子设备,则创建 macvlan_port,与主设备关联起来 */
if (!macvlan_port_exists(lowerdev)) {
err = macvlan_port_create(lowerdev);
if (err < 0)
return err;
}
port = macvlan_port_get_rtnl(lowerdev);
/* Only 1 macvlan device can be created in passthru mode */
if (port->passthru)
return -EINVAL;
vlan->lowerdev = lowerdev;
vlan->dev = dev;
vlan->port = port;
vlan->receive = receive;
vlan->forward = forward;
vlan->mode = MACVLAN_MODE_VEPA;
if (data && data[IFLA_MACVLAN_MODE])
vlan->mode = nla_get_u32(data[IFLA_MACVLAN_MODE]);
if (data && data[IFLA_MACVLAN_FLAGS])
vlan->flags = nla_get_u16(data[IFLA_MACVLAN_FLAGS]);
if (vlan->mode == MACVLAN_MODE_PASSTHRU) {
if (port->count)
return -EINVAL;
port->passthru = true;
memcpy(dev->dev_addr, lowerdev->dev_addr, ETH_ALEN);
}
err = netdev_upper_dev_link(lowerdev, dev);
if (err)
goto destroy_port;
port->count += 1;
err = register_netdevice(dev);
if (err < 0)
goto upper_dev_unlink;
list_add_tail_rcu(&vlan->list, &port->vlans);
netif_stacked_transfer_operstate(lowerdev, dev);
return 0;
upper_dev_unlink:
netdev_upper_dev_unlink(lowerdev, dev);
destroy_port:
port->count -= 1;
if (!port->count)
macvlan_port_destroy(lowerdev);
return err;
}
macvlan_port_create
下面创建macvlan_port的函数macvlan_port_create入参就是主设备的net_device。
static int macvlan_port_create(struct net_device *dev)
{
struct macvlan_port *port;
unsigned int i;
int err;
if (dev->type != ARPHRD_ETHER || dev->flags & IFF_LOOPBACK)
return -EINVAL;
port = kzalloc(sizeof(*port), GFP_KERNEL);
if (port == NULL)
return -ENOMEM;
port->passthru = false;
port->dev = dev; /* 与主设备关联 */
INIT_LIST_HEAD(&port->vlans);
for (i = 0; i < MACVLAN_HASH_SIZE; i++)
INIT_HLIST_HEAD(&port->vlan_hash[i]);
err = netdev_rx_handler_register(dev, macvlan_handle_frame, port); /* 注册了rx_handler */
if (err)
kfree(port);
else
dev->priv_flags |= IFF_MACVLAN_PORT;
return err;
}
macvlan_handle_frame:主设备 rx_handler
err = netdev_rx_handler_register(dev, macvlan_handle_frame, port);
数据包收发
macvlan rx
macvlan的rx是通过在主设备上面注册rx_handle, rx_handle的处理函数为macvlan_handle_frame。macvlan收到数据包按照如下过程处理:
static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb)
{
struct macvlan_port *port;
struct sk_buff *skb = *pskb;
const struct ethhdr *eth = eth_hdr(skb);
const struct macvlan_dev *vlan;
const struct macvlan_dev *src;
struct net_device *dev;
unsigned int len = 0;
int ret = NET_RX_DROP;
port = macvlan_port_get_rcu(skb->dev);
/*
广播、多播处理
1、利用源mac去macvlan_port中查找是否是端口forward的数据包
2、如果源没有找到广播给所有端口
3、如果源被找到了说明是bridge/evpa模式,如果源端口时evpa模式则叫广播扩散到evpa和bridge口,如果源端口是bridge模式则扩散给evpa口(bridge口在rx的时候早已转发),其它模式给该设备本身
*/
if (is_multicast_ether_addr(eth->h_dest)) {
skb = ip_check_defrag(skb, IP_DEFRAG_MACVLAN);
if (!skb)
return RX_HANDLER_CONSUMED;
eth = eth_hdr(skb);
src = macvlan_hash_lookup(port, eth->h_source);
if (!src)
/* frame comes from an external address */
macvlan_broadcast(skb, port, NULL,
MACVLAN_MODE_PRIVATE |
MACVLAN_MODE_VEPA |
MACVLAN_MODE_PASSTHRU|
MACVLAN_MODE_BRIDGE);
else if (src->mode == MACVLAN_MODE_VEPA)
/* flood to everyone except source */
macvlan_broadcast(skb, port, src->dev,
MACVLAN_MODE_VEPA |
MACVLAN_MODE_BRIDGE);
else if (src->mode == MACVLAN_MODE_BRIDGE)
/*
* flood only to VEPA ports, bridge ports
* already saw the frame on the way out.
*/
macvlan_broadcast(skb, port, src->dev,
MACVLAN_MODE_VEPA);
else {
/* forward to original port. */
vlan = src;
ret = macvlan_broadcast_one(skb, vlan, eth, 0);
goto out;
}
return RX_HANDLER_PASS;
}
/*
单播数据处理
1、macvlan处于passthrough模式下,则数据包直接交给该port
2、利用目的mac去macvlan_port中查找是否存在匹配的mac地址,查找到则调用macvlan_dev中的receive(这个和以太网不接收mac不是自己的数据一致)
3、没有匹配的macvlan_dev则将数据交给master
*/
if (port->passthru)
vlan = list_first_or_null_rcu(&port->vlans,
struct macvlan_dev, list);
else
vlan = macvlan_hash_lookup(port, eth->h_dest);
if (vlan == NULL)
return RX_HANDLER_PASS;
dev = vlan->dev;
if (unlikely(!(dev->flags & IFF_UP))) {
kfree_skb(skb);
return RX_HANDLER_CONSUMED;
}
len = skb->len + ETH_HLEN;
skb = skb_share_check(skb, GFP_ATOMIC);
if (!skb)
goto out;
skb->dev = dev;
skb->pkt_type = PACKET_HOST;
ret = vlan->receive(skb);
out:
macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, 0);
return RX_HANDLER_CONSUMED;
}
macvlan tx
macvlan的tx和模式有关,当macvlan是bridge模式的时候,tx的时候会将广播包或者单播目的mac地址是其它网卡的设备通过macvlan_dev的forward函数转发给其它的虚拟网卡。其它模式的时候将直接从主设备TX出去。
static const struct net_device_ops macvlan_netdev_ops = {
.ndo_init = macvlan_init,
.ndo_uninit = macvlan_uninit,
.ndo_open = macvlan_open,
.ndo_stop = macvlan_stop,
.ndo_start_xmit = macvlan_start_xmit,
.ndo_change_mtu = macvlan_change_mtu,
.ndo_change_rx_flags = macvlan_change_rx_flags,
.ndo_set_mac_address = macvlan_set_mac_address,
.ndo_set_rx_mode = macvlan_set_mac_lists,
.ndo_get_stats64 = macvlan_dev_get_stats64,
.ndo_validate_addr = eth_validate_addr,
.ndo_vlan_rx_add_vid = macvlan_vlan_rx_add_vid,
.ndo_vlan_rx_kill_vid = macvlan_vlan_rx_kill_vid,
.ndo_fdb_add = macvlan_fdb_add,
.ndo_fdb_del = macvlan_fdb_del,
.ndo_fdb_dump = ndo_dflt_fdb_dump,
};
netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
struct net_device *dev)
{
unsigned int len = skb->len;
int ret;
const struct macvlan_dev *vlan = netdev_priv(dev);
ret = macvlan_queue_xmit(skb, dev);
if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) {
struct macvlan_pcpu_stats *pcpu_stats;
pcpu_stats = this_cpu_ptr(vlan->pcpu_stats);
u64_stats_update_begin(&pcpu_stats->syncp);
pcpu_stats->tx_packets++;
pcpu_stats->tx_bytes += len;
u64_stats_update_end(&pcpu_stats->syncp);
} else {
this_cpu_inc(vlan->pcpu_stats->tx_dropped);
}
return ret;
}
static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
{
const struct macvlan_dev *vlan = netdev_priv(dev);
const struct macvlan_port *port = vlan->port;
const struct macvlan_dev *dest;
__u8 ip_summed = skb->ip_summed;
if (vlan->mode == MACVLAN_MODE_BRIDGE) {
const struct ethhdr *eth = (void *)skb->data;
skb->ip_summed = CHECKSUM_UNNECESSARY;
/* send to other bridge ports directly */
if (is_multicast_ether_addr(eth->h_dest)) {
macvlan_broadcast(skb, port, dev, MACVLAN_MODE_BRIDGE);
goto xmit_world;
}
dest = macvlan_hash_lookup(port, eth->h_dest);
if (dest && dest->mode == MACVLAN_MODE_BRIDGE) {
/* send to lowerdev first for its network taps */
dev_forward_skb(vlan->lowerdev, skb);
return NET_XMIT_SUCCESS;
}
}
xmit_world:
skb->ip_summed = ip_summed;
skb->dev = vlan->lowerdev;
return dev_queue_xmit(skb);
}