原文出处:
http://blog.chinaunix.net/uid-22362479-id-3220107.html
http://blog.chinaunix.net/uid-22362479-id-3220136.html
使用socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL))创建的套接字到底为何与众不同?今日追踪了一下,使用的是Linux 3.2.5版内核。(注意protocol参数必须是网络字节序,下文的packet_create()不做字节序转换,ptype_head()也是与htons(ETH_P_ALL)比较。)
net/socket.c
点击(此处)折叠或打开
/*
 * socket(2) system call entry (abridged; "......" marks code omitted by the
 * article). The only step shown is the hand-off of the user-supplied
 * family/type/protocol to sock_create(), with &sock as the output location.
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
......
/* Create the socket object for the requested family/type/protocol. */
retval = sock_create(family, type, protocol, &sock);
......
}
点击(此处)折叠或打开
/*
 * Thin wrapper around __sock_create(): supplies the calling task's network
 * namespace (current->nsproxy->net_ns) and passes 0 for the `kern` argument.
 * The return value of __sock_create() is passed through unchanged.
 */
int sock_create(int family, int type, int protocol, struct socket **res)
{
return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
}
点击(此处)折叠或打开
/*
 * Abridged excerpt ("......" marks omitted code). Shows how the generic
 * socket layer dispatches to the address family: the entry stored in
 * net_families[family] is fetched and its ->create() hook is invoked.
 * `pf`, `sock` and `err` are declared in the omitted parts.
 */
int __sock_create(struct net *net, int family, int type, int protocol,
struct socket **res, int kern)
{
......
/* Fetch the protocol-family ops registered for this family. */
pf = rcu_dereference(net_families[family]);
......
/* Family-specific construction; for PF_PACKET this is packet_create(). */
err = pf->create(net, sock, protocol, kern);
......
}
点击(此处)折叠或打开
/* Table of registered protocol families, indexed by family number. */
static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly;
/*
 * Register a protocol family (abridged; "......" marks omitted code).
 * If the slot for ops->family is already occupied, err is set to -EEXIST;
 * otherwise `ops` is published into the slot and err is set to 0.
 * `err` is declared in the omitted part.
 */
int sock_register(const struct net_proto_family *ops)
{
......
/*
 * The lockdep_is_held() annotation indicates this read is expected to run
 * with net_family_lock held; the locking itself is in the omitted code.
 */
if (rcu_dereference_protected(net_families[ops->family],
lockdep_is_held(&net_family_lock)))
err = -EEXIST;
else {
/* Publish the new family so that __sock_create() can find it. */
rcu_assign_pointer(net_families[ops->family], ops);
err = 0;
}
......
}
net/packet/af_packet.c
点击(此处)折叠或打开
/*
 * Protocol-family descriptor for PF_PACKET. Its .create hook is
 * packet_create, which is what __sock_create() reaches via pf->create().
 */
static const struct net_proto_family packet_family_ops = {
.family = PF_PACKET,
.create = packet_create,
.owner = THIS_MODULE,
};
/*
 * Module init for af_packet (abridged; "......" marks omitted code).
 * The step shown registers packet_family_ops with the socket layer.
 */
static int __init packet_init(void)
{
......
/* Install PF_PACKET into net_families[]; the return value is not checked here. */
sock_register(&packet_family_ops);
......
}
/* Run packet_init() at module/boot initialisation. */
module_init(packet_init);
点击(此处)折叠或打开
/*
 * PF_PACKET ->create() hook (abridged; "......" marks omitted code).
 * Allocates the sock, installs packet_ops on the socket, and prepares the
 * per-socket packet_type hook (po->prot_hook). The hook is registered
 * immediately only when a non-zero protocol was passed to socket().
 */
static int packet_create(struct net *net, struct socket *sock, int protocol,
int kern)
{
struct sock *sk;
struct packet_sock *po;
/*
 * Plain cast, no byte swapping: the value is used as-is as a __be16,
 * so the caller has to supply it already in network byte order.
 */
__be16 proto = (__force __be16)protocol; /* weird, but documented */
......
sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
......
sock->ops = &packet_ops;
......
po = pkt_sk(sk);
/* Two statements share the next line: set the family, remember the protocol. */
sk->sk_family = PF_PACKET;po->num = proto;
......
/* Handler the hook invokes for each matching packet. */
po->prot_hook.func = packet_rcv;
......
/* Back-pointer from the hook to the owning socket. */
po->prot_hook.af_packet_priv = sk;
/* A zero protocol leaves the hook unregistered at creation time. */
if (proto) {
po->prot_hook.type = proto;
register_prot_hook(sk);
}
......
}
AF_PACKET套接字的功能来源于prot_hook,其本身是struct packet_type类型:
1:type成员设定为了socket()传递的protocol参数(这里是htons(ETH_P_ALL),即网络字节序的ETH_P_ALL)
2:过滤得到的包的处理函数保存于func成员,这里被设定为 packet_rcv()
3:dev成员用于对net_device的过滤,可以在bind()中指定
点击(此处)折叠或打开
/*
 * bind() handler for packet sockets (abridged; "......" marks omitted code).
 * `sll`, `sk`, `dev`, `err` and the `out` label live in the omitted parts.
 */
static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
......
/* A non-zero interface index requests binding to one specific device. */
if(sll->sll_ifindex){
/* Preset the error so the failure path below leaves err == -ENODEV. */
err =-ENODEV;
dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
if(dev ==NULL)
goto out;
}
/*
 * GNU "?:" extension: use sll_protocol when it is non-zero, otherwise keep
 * the protocol already stored on the socket (pkt_sk(sk)->num).
 */
err =packet_do_bind(sk, dev, sll->sll_protocol ?: pkt_sk(sk)->num);
......
}
点击(此处)折叠或打开
/*
 * Attach the packet socket's hook to `dev` (abridged excerpt from
 * net/packet/af_packet.c; "......" marks code omitted by the article).
 *
 * @sk:       the packet socket being bound
 * @dev:      device to filter on, or NULL for no device filter
 * @protocol: protocol value handed down from packet_bind()
 */
static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
{
	struct packet_sock *po = pkt_sk(sk);
	......
	/* Record the device filter on the hook; ifindex is 0 when no device is given. */
	po->prot_hook.dev = dev;
	po->ifindex = dev ? dev->ifindex : 0;
	......
	/* Register the hook when there is no device filter, or the device is up. */
	if (!dev || (dev->flags & IFF_UP)) {
		register_prot_hook(sk);
	}
	......
}
可见bind()同样会触发prot_hook的注册动作:只要没有指定net_device(dev为NULL),或者指定的net_device处于IFF_UP状态,就会调用register_prot_hook()。
点击(此处)折叠或打开
/*
 * Activate the socket's packet hook if it is not already running.
 * Fanout sockets are linked via __fanout_link(); all others add their
 * prot_hook with dev_add_pack(). A second call while po->running is set
 * does nothing.
 */
static void register_prot_hook(struct sock *sk)
{
struct packet_sock *po = pkt_sk(sk);
if(!po->running){
if(po->fanout)
__fanout_link(sk, po);
else
dev_add_pack(&po->prot_hook);
/* Not part of the else: a socket reference is taken on both paths. */
sock_hold(sk);
po->running = 1;
}
}
net/core/dev.c
点击(此处)折叠或打开
/*
 * Add a packet_type handler to the list selected by ptype_head(pt).
 * The insertion uses list_add_rcu() and is done while holding ptype_lock.
 */
void dev_add_pack(struct packet_type *pt)
{
struct list_head *head =ptype_head(pt);
spin_lock(&ptype_lock);
list_add_rcu(&pt->list, head);
spin_unlock(&ptype_lock);
}
点击(此处)折叠或打开
static struct list_head ptype_all __read_mostly; /* Taps */
/*
 * Select the list a packet_type belongs on:
 *  - type equal to htons(ETH_P_ALL)  -> the ptype_all list
 *  - any other type                  -> the ptype_base[] bucket indexed by
 *                                       the host-order type masked with
 *                                       PTYPE_HASH_MASK
 * pt->type is compared in network byte order.
 */
static inline struct list_head *ptype_head(const struct packet_type *pt)
{
if(pt->type == htons(ETH_P_ALL))
return &ptype_all;
else
return &ptype_base[ntohs(pt->type)& PTYPE_HASH_MASK];
}
历尽千辛万苦,终于知道AF_PACKET套接字把自己的prot_hook挂到了ptype_all链表上(type为htons(ETH_P_ALL)时)或ptype_base的某个哈希链表上(其他type时)。当AF_PACKET套接字注册了prot_hook后,怎样进行监听呢?先来看发送:
当协议栈准备将数据交给net_device发送时,它将调用dev_queue_xmit():
点击(此处)折叠或打开
/*
 * Transmit entry point (abridged; "......" marks omitted code).
 * The step shown passes the skb on to dev_hard_start_xmit() for the
 * device recorded in skb->dev. `txq` and `rc` come from the omitted parts.
 */
int dev_queue_xmit(struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
......
rc = dev_hard_start_xmit(skb, dev, txq);
......
}
点击(此处)折叠或打开
/*
 * Abridged excerpt ("......" marks omitted code). The step shown is the
 * tap point on the transmit path: when ptype_all is non-empty, the skb is
 * passed to dev_queue_xmit_nit().
 */
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
struct netdev_queue *txq)
{
......
/* Only taken when at least one handler sits on ptype_all. */
if(!list_empty(&ptype_all))
dev_queue_xmit_nit(skb, dev);
......
}
由于AF_PACKET套接字(protocol为htons(ETH_P_ALL))注册的prot_hook挂在ptype_all链表上,ptype_all不再为空,于是dev_queue_xmit_nit()被调用:
点击(此处)折叠或打开
/*
 * Hand an outgoing skb to the handlers on ptype_all (abridged; "......"
 * marks omitted code). The list is walked inside an RCU read-side section.
 * Delivery runs one handler behind: each further match delivers to the
 * previous match (pt_prev), and the last match is called after the loop.
 */
static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
struct packet_type *ptype;
struct sk_buff *skb2 =NULL;
struct packet_type *pt_prev =NULL;
rcu_read_lock();
list_for_each_entry_rcu(ptype,&ptype_all, list){
/* Never send packets back to the socket
* they originated from - MvS (miquels@drinkel.ow.org)
*/
/*
 * Match when the handler is bound to this device or to no device, and
 * the handler's af_packet_priv is NULL or differs from skb->sk.
 */
if((ptype->dev == dev ||!ptype->dev)&&
(ptype->af_packet_priv ==NULL||
(struct sock *)ptype->af_packet_priv != skb->sk)){
/* Second and later matches: deliver skb2 to the previous handler. */
if(pt_prev){
deliver_skb(skb2, pt_prev, skb->dev);
pt_prev = ptype;
continue;
}
/* First match: clone the skb; stop walking if the clone fails. */
skb2 = skb_clone(skb, GFP_ATOMIC);
if(!skb2)
break;
/*
 * NOTE(review): omitted code; presumably prepares skb2 and records
 * ptype as pt_prev. Confirm against net/core/dev.c.
 */
......
}
}
/* Deliver to the last matching handler by calling its func directly. */
if(pt_prev)
pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
rcu_read_unlock();
}
在遍历ptype_all链表时,这里有几点需要着重说明:
1:对于发送的包过滤条件有:
1). net_device是否是prot_hook指定的dev(NULL代表全部匹配)。
2). ptype->af_packet_priv在packet_create()中被设定为套接字自己的sk,故自己发送的包(skb->sk与之相同)不会被监听。
2:遍历ptype_all时,遇到第一个匹配的entry时会复制skb;只有ptype_all中匹配的entry不止1个时,才会调用deliver_skb()。
3:当退出遍历时,将调用prot_hook的func成员,即packet_rcv()。
4: deliver_skb()仅仅是在调用prot_hook的func成员前增加skb的引用计数。
点击(此处)折叠或打开
/*
 * prot_hook.func handler installed by packet_create() (abridged; "......"
 * marks omitted code). The step shown appends the skb to the tail of the
 * socket's receive queue. `sk` is obtained in the omitted part.
 */
static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
......
__skb_queue_tail(&sk->sk_receive_queue, skb);
......
}
经过packet_rcv(),发送的数据包被加入到了AF_PACKET套接字的接收队列,等待我们的读取……一切就是这么简单!