这里就不贴源码了,源码分析的话,网上一大堆,我这里只是简要的描述下epoll的实现和一些关键的代码片段。
相关的文件在 fs/eventpoll.c中,我看的是2.6.38的内核代码.
1 epoll在创建的时候会调用anon_inode_getfd新建一个file instance,也就是epoll可以看成一个文件。因此我们可以看到epoll_create会返回一个fd.
1
2
|
error = anon_inode_getfd( "[eventpoll]" , &eventpoll_fops, ep, O_RDWR | (flags & O_CLOEXEC)); |
2 epoll所管理的所有的句柄都是放在一个大的结构eventpoll(红黑树)中,而这个结构是保存在file 的private_data域中的(因为epoll本身就是一个文件).这样每次通过epoll fd就可以直接得到eventpoll.
1
2
3
4
5
|
file = fget(epfd); /* Get the "struct file *" for the target file */ tfile = fget(fd); ................................... ep = file->private_data; |
3 每一个加入到epoll监听的句柄(也就是红黑树的一个节点)都是一个epitem.它包含了一个 struct eventpoll *ep,也就是它所属于的eventpoll(epoll实例).
1
2
3
4
5
6
7
8
9
10
11
12
|
if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL))) return -ENOMEM; /* Item initialization follow here ... */ INIT_LIST_HEAD(&epi->rdllink); INIT_LIST_HEAD(&epi->fllink); INIT_LIST_HEAD(&epi->pwqlist); epi->ep = ep; ep_set_ffd(&epi->ffd, tfile, fd); epi->event = *event; epi->nwait = 0; epi->next = EP_UNACTIVE_PTR; |
4 在eventpoll中包含两个wait queue,一个是被epoll_wait使用的(wq),一个是被file->poll使用的(poll_wait).这两个都是属于 eventpoll.而在epitem也有一个wait queue(pwqlist),这个queue是fd私有的wait queue,所以它是保存在epitem中的。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
|
struct epitem { /* RB tree node used to link this structure to the eventpoll RB tree */ struct rb_node rbn; ................................................ /* List containing poll wait queues */ struct list_head pwqlist; /* The "container" of this item */ struct eventpoll *ep; ............................................. }; struct eventpoll { /* Protect the this structure access */ spinlock_t lock; /* * This mutex is used to ensure that files are not removed * while epoll is using them. This is held during the event * collection loop, the file cleanup path, the epoll file exit * code and the ctl operations. */ struct mutex mtx; /* Wait queue used by sys_epoll_wait() */ wait_queue_head_t wq; /* Wait queue used by file->poll() */ wait_queue_head_t poll_wait; /* List of ready file descriptors */ struct list_head rdllist; /* RB tree root used to store monitored fd structs */ struct rb_root rbr; /* * This is a single linked list that chains all the "struct epitem" that * happened while transfering ready events to userspace w/out * holding ->lock. */ struct epitem *ovflist; ................................................. }; |
5 当我们添加一个句柄到一个epoll fd的时候,默认是会包含POLLERR和POLLHUP事件.并将epitem插入到红黑树中(eventpoll).然后会初始化一个 poll_table,然后设置它的回调函数为ep_poll_callback,紧接着调用file->poll,如果是socket,则会调用 tcp_poll,这个函数将会调用ep_poll_callback.
1
2
3
4
5
6
7
8
9
|
switch (op) { case EPOLL_CTL_ADD: if (!epi) { epds.events |= POLLERR | POLLHUP; error = ep_insert(ep, &epds, tfile, fd); } else error = -EEXIST; break ; ............................................. |
6 ep_poll_callback这个函数主要是绑定epitem的wait queue的回调为ep_poll_callback.也就是对应fd如果有事件,则就会调用ep_poll_callback。
7 epoll中保存了一个read list(rdllist),所有的已经有通知事件的句柄,都会放到这个list中,而对应的操作就是在ep_poll_callback中,在 ep_poll_callback中主要就是由wait queue指针来取得对应的epitem,然后再取得eventpoll,并将这个epitem加入到ready list,唤醒epoll_wait(wq).这里可以看到由于一个句柄只会对应一个epitem,所以在rdllist中,也不会有重复的 epitem,在ep_poll_callback会判断是否rdllist中是否已经包含了将要插入的epitem,如果包含,则直接返回.
1
2
3
4
5
6
7
8
|
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key) { struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; ............................................... /* If this file is already in the ready list we exit soon */ if (!ep_is_linked(&epi->rdllink)) list_add_tail(&epi->rdllink, &ep->rdllist); |
8 当系统调用epoll_wait,如果ready list(rdllist)为空,则休眠等待被唤醒。当被唤醒之后(第7条),将rdllist复制(指针)到一个新的list(主要是针对LT),然后 调用ep_send_events_proc对这个新的list进行遍历(对应的会从rdllist中删除这个epitem).遍历完毕后,最终会返回给 用户对应的数据.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
|
if (list_empty(&ep->rdllist)) { /* * We don't have any available event to return to the caller. * We need to sleep here, and we will be wake up by * ep_poll_callback() when events will become available. */ init_waitqueue_entry(&wait, current); __add_wait_queue_exclusive(&ep->wq, &wait); for (;;) { /* * We don't want to sleep if the ep_poll_callback() sends us * a wakeup in between. That's why we set the task state * to TASK_INTERRUPTIBLE before doing the checks. */ set_current_state(TASK_INTERRUPTIBLE); if (!list_empty(&ep->rdllist) || timed_out) break ; if (signal_pending(current)) { res = -EINTR; break ; } spin_unlock_irqrestore(&ep->lock, flags); if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) timed_out = 1; spin_lock_irqsave(&ep->lock, flags); } __remove_wait_queue(&ep->wq, &wait); set_current_state(TASK_RUNNING); } |
9 LT和ET的区别是在ep_send_events_proc中处理的,如果是LT,不但会将对应的数据返回给用户,并且会将当前的epitem再次加入到rdllist中。这样子,如果下次再次被唤醒就会给用户空间再次返回事件.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
|
static int ep_send_events_proc( struct eventpoll *ep, struct list_head *head, void *priv) { .................................................... if (__put_user(revents, &uevent->events) || __put_user(epi->event.data, &uevent->data)) { list_add(&epi->rdllink, head); return eventcnt ? eventcnt : -EFAULT; } eventcnt++; uevent++; if (epi->event.events & EPOLLONESHOT) epi->event.events &= EP_PRIVATE_BITS; else if (!(epi->event.events & EPOLLET)) { /* * If this file has been added with Level * Trigger mode, we need to insert back inside * the ready list, so that the next call to * epoll_wait() will check again the events * availability. At this point, noone can insert * into ep->rdllist besides us. The epoll_ctl() * callers are locked out by * ep_scan_ready_list() holding "mtx" and the * poll callback will queue them in ep->ovflist. */ list_add_tail(&epi->rdllink, &ep->rdllist); } .............................. } |
10 eventpol还有一个list,叫做ovflist,主要是解决当内核在传输数据给用户空间(ep_send_events_proc)时的锁(eventpoll->mtx),此时epoll就是将这个时候传递上来的事件保存到ovflist中。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
|
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key) { ...................................... /* * If we are trasfering events to userspace, we can hold no locks * (because we're accessing user memory, and because of linux f_op->poll() * semantics). All the events that happens during that period of time are * chained in ep->ovflist and requeued later on. */ if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) { if (epi->next == EP_UNACTIVE_PTR) { epi->next = ep->ovflist; ep->ovflist = epi; } goto out_unlock; } ......................... } |
11 当从ep_send_events_proc返回(ep_scan_ready_list)后,会遍历ovflist,然后将ready的epitem保存到rdllist,以便与下次再次被唤醒时进行操作.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
|
static int ep_scan_ready_list( struct eventpoll *ep, int (*sproc)( struct eventpoll *, struct list_head *, void *), void *priv) { ........................................... /* * During the time we spent inside the "sproc" callback, some * other events might have been queued by the poll callback. * We re-insert them inside the main ready-list here. */ for (nepi = ep->ovflist; (epi = nepi) != NULL; nepi = epi->next, epi->next = EP_UNACTIVE_PTR) { /* * We need to check if the item is already in the list. * During the "sproc" callback execution time, items are * queued into ->ovflist but the "txlist" might already * contain them, and the list_splice() below takes care of them. */ if (!ep_is_linked(&epi->rdllink)) list_add_tail(&epi->rdllink, &ep->rdllist); } ........................ } |