dpdk中断机制
这里主要介绍一下dpdk的中断机制,虽然dpdk大多数场景用的是polling模式,但是也是支持中断模式的,另一方面除了收发包之外,设备的其他功能,如状态改变等,还是要依赖中断机制。当然dpdk的中断是用户态的中断,实现方式是通过vfio或uio模块将内核的中断传递到用户态,具体vfio和uio的工作方式不是本文的重点,这里重点关注dpdk的中断处理流程。首先看一下dpdk中断处理相关的初始化流程。
3.5.1 中断初始化
rte_eal_init → rte_eal_intr_init
中断初始化主要在rte_eal_intr_init中完成。
● rte_eal_intr_init
在rte_eal_intr_init()函数中初始化中断。具体如下:
(1) 首先初始化intr_sources链表。所有设备的中断都挂在这个链表上,中断处理线程通过遍历这个链表,来执行设备的中断。
(2) 创建intr_pipe管道,用于epoll模型的消息通知。
(3) 创建线程intr_thread,线程的执行体是eal_intr_thread_main()函数,创建epoll模型,遍历intr_sources链表,监听已注册的所有UIO设备的中断事件,并调用对应UIO设备的中断处理函数。
点击(此处)折叠或打开
- int rte_eal_intr_init(void)
- { /* Initialise the interrupt source list, the notification pipe and the interrupt thread; returns -1 if pipe() fails, otherwise the negated pthread_create() result (0 on success). */
- int ret = 0, ret_1 = 0;
- char thread_name[RTE_MAX_THREAD_NAME_LEN];
-
- /* init the global interrupt source head */
- /* Initialise the global intr_sources list, which stores the interrupt resources of devices. */
- TAILQ_INIT(&intr_sources);
-
- /**
- * create a pipe which will be waited by epoll and notified to
- * rebuild the wait list of epoll.
- */
- /* Create the pipe; the two returned fds are kept in the global intr_pipe. */
- if (pipe(intr_pipe.pipefd) < 0)
- return -1;
- /* Create the interrupt-handling thread. */
- /* create the host thread to wait/handle the interrupt */
- ret = pthread_create(&intr_thread, NULL,
- eal_intr_thread_main, NULL);
- if (ret != 0) {
- RTE_LOG(ERR, EAL,
- "Failed to create thread for interrupt handling\n");
- } else {
- /* Set thread_name for aid in debugging. */
- snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN,
- "eal-intr-thread");
- ret_1 = rte_thread_setname(intr_thread, thread_name);
- if (ret_1 != 0) /* a naming failure is only logged; it does not affect the return value */
- RTE_LOG(DEBUG, EAL,
- "Failed to set thread name for interrupt handling\n");
- }
-
- return -ret;
- }
在继续分析之前先看下intr_sources这个全局链表的样子,如下图所示:
链表由struct rte_intr_source结构组成,每个struct rte_intr_source结构描述一个设备的中断信息。而struct rte_intr_source中又有三个重要成员:
● intr_handle
点击(此处)折叠或打开
- struct rte_intr_handle { /* Per-device interrupt description: one event fd plus per-queue event fds and their epoll bookkeeping. */
- RTE_STD_C11
- union {
- int vfio_dev_fd; /**< VFIO device file descriptor */
- int uio_cfg_fd; /**< UIO config file descriptor
- for uio_pci_generic */
- };
- int fd; /**< interrupt event file descriptor */
- enum rte_intr_handle_type type; /**< handle type */
- uint32_t max_intr; /* nb_efd+1 */
- uint32_t nb_efd; /* number of valid entries in efds */
- int efds[RTE_MAX_RXTX_INTR_VEC_ID]; /* fds that deliver interrupts, one per queue */
- struct rte_epoll_event elist[RTE_MAX_RXTX_INTR_VEC_ID];
- /**< intr vector epoll event */
- int *intr_vec; /**< intr vector number array; per the original note, the ring offset of each queue */
- };
这个结构用来记录设备中断的相关信息,其中主要是设备每个队列对应的传递中断的fd(如uio或vfio暴露给用户态的文件打开后得到的fd)。当然较新的dpdk(如18.05)虚拟设备也可以支持中断,如vhost_user后端设备。如果对vhost_user设备的rte_intr_handle进行初始化,可以如下进行:
点击(此处)折叠或打开
- static int
- eth_vhost_install_intr(struct rte_eth_dev *dev)
- { /* Build dev->intr_handle for a vhost device: for each non-NULL RX queue, record its vector number and use the vring callfd as the queue's event fd. Always returns 0 in this listing. */
- struct rte_vhost_vring vring;
- struct vhost_queue *vq;
- int count = 0;
- int nb_rxq = dev->data->nb_rx_queues;
- int i;
- int ret;
-
- dev->intr_handle = malloc(sizeof(*dev->intr_handle)); /* NOTE(review): the malloc result is not checked before the memset below */
- memset(dev->intr_handle, 0, sizeof(*dev->intr_handle));
- dev->intr_handle->intr_vec =
- malloc(nb_rxq * sizeof(dev->intr_handle->intr_vec[0])); /* NOTE(review): this allocation is not checked either */
-
- for (i = 0; i < nb_rxq; i++) {
- vq = dev->data->rx_queues[i];
- if (!vq)
- continue;
- ret = rte_vhost_get_vhost_vring(vq->vid, i << 1, &vring); /* NOTE(review): ret is stored but never examined in this listing */
- dev->intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + i;
- dev->intr_handle->efds[i] = vring.callfd; /* for a vhost_user device the callfd is used to receive interrupts from the front end */
- count++;
- }
-
- dev->intr_handle->nb_efd = count;
- dev->intr_handle->max_intr = count + 1;
- dev->intr_handle->type = RTE_INTR_HANDLE_VDEV;
-
- return 0;
- }
● callbacks
这是一个rte_intr_callback结构组成的链表,主要保存设备的中断处理函数和参数信息。为什么要一个链表呢?因为可以对一个中断注册多个处理函数。
● active
描述设备中断的状态,即设备上是否有未处理的中断。
下面来看eal_intr_thread_main函数,也就是中断线程的主体函数。
● eal_intr_thread_main
中断线程执行主体eal_intr_thread_main()函数具体如下:
(1) epoll_create()创建epoll模型。
(2) 将intr_pipe管道加入到epoll中。
(3) 遍历intr_sources链表,将所有UIO设备加入到epoll中。
(4) 执行eal_intr_handle_interrupts()函数。
● eal_intr_thread_main
点击(此处)折叠或打开
- static __attribute__((noreturn)) void * eal_intr_thread_main(__rte_unused void *arg)
- { /* Body of the interrupt thread: forever build an epoll set from intr_pipe.readfd plus the fd of every source that has callbacks, serve it, then rebuild. */
- struct epoll_event ev;
-
- /* host thread, never break out */
- for (;;) {
- /* build up the epoll fd with all descriptors we are to
- * wait on then pass it to the handle_interrupts function
- */
- static struct epoll_event pipe_event = {
- .events = EPOLLIN | EPOLLPRI,
- };
- struct rte_intr_source *src;
- unsigned numfds = 0;
-
- /* create epoll fd */
- int pfd = epoll_create(1);
- if (pfd < 0)
- rte_panic("Cannot create epoll instance\n");
- /* intr_pipe is a global variable already initialised in rte_eal_intr_init. */
- pipe_event.data.fd = intr_pipe.readfd;
- /**
- * add pipe fd into wait list, this pipe is used to
- * rebuild the wait list.
- */
- /* Add intr_pipe.readfd to the epoll watch list. */
- if (epoll_ctl(pfd, EPOLL_CTL_ADD, intr_pipe.readfd,
- &pipe_event) < 0) {
- rte_panic("Error adding fd to %d epoll_ctl, %s\n",
- intr_pipe.readfd, strerror(errno));
- }
- numfds++;
-
- rte_spinlock_lock(&intr_lock);
- /* Walk intr_sources and add each device's interrupt notification fd to epoll. */
- TAILQ_FOREACH(src, &intr_sources, next) {
- if (src->callbacks.tqh_first == NULL)
- continue; /* skip those with no callbacks */
- ev.events = EPOLLIN | EPOLLPRI;
- ev.data.fd = src->intr_handle.fd;
-
- /**
- * add all the uio device file descriptor
- * into wait list.
- */
- if (epoll_ctl(pfd, EPOLL_CTL_ADD,
- src->intr_handle.fd, &ev) < 0){
- rte_panic("Error adding fd %d epoll_ctl, %s\n",
- src->intr_handle.fd, strerror(errno));
- }
- else
- numfds++;
- }
- rte_spinlock_unlock(&intr_lock);
- /* serve the interrupt */
- eal_intr_handle_interrupts(pfd, numfds); /* numfds sizes the event array used by the handler */
-
- /**
- * when we return, we need to rebuild the
- * list of fds to monitor.
- */
- close(pfd);
- }
- }
然后函数调用eal_intr_handle_interrupts。
● eal_intr_handle_interrupts
eal_intr_handle_interrupts主要就是在死循环中调用epoll,然后处理中断。
点击(此处)折叠或打开
- static void eal_intr_handle_interrupts(int pfd, unsigned totalfds)
- { /* Wait on epoll fd pfd in a loop and dispatch ready events; returns when epoll_wait() fails with an error other than EINTR, or when eal_intr_process_interrupts() returns < 0. */
- struct epoll_event events[totalfds];
- int nfds = 0;
-
- for(;;) {
- nfds = epoll_wait(pfd, events, totalfds,
- EAL_INTR_EPOLL_WAIT_FOREVER);
- /* epoll_wait fail */
- if (nfds < 0) {
- if (errno == EINTR)
- continue;
- RTE_LOG(ERR, EAL,
- "epoll_wait returns with fail\n");
- return;
- }
- /* epoll_wait timeout, will never happens here */
- else if (nfds == 0)
- continue;
- /* epoll_wait has at least one fd ready to read */
- /* A negative return here ends the loop; the only other exit is the epoll_wait failure path above. */
- if (eal_intr_process_interrupts(events, nfds) < 0)
- return;
- }
- }
这个函数在一个for(;;)死循环中,调用epoll_wait()阻塞模式监听事件。如果有事件发生,则调用eal_intr_process_interrupts()函数。
● eal_intr_process_interrupts
点击(此处)折叠或打开
- static int eal_intr_process_interrupts(struct epoll_event *events, int nfds)
- { /* Handle nfds ready epoll events: returns -1 as soon as the pipe fd is seen (caller must rebuild the wait list), otherwise reads each fd, runs the matching source's callbacks and returns 0. */
- int n, bytes_read;
- struct rte_intr_source *src;
- struct rte_intr_callback *cb;
- union rte_intr_read_buffer buf;
- struct rte_intr_callback active_cb;
-
- for (n = 0; n < nfds; n++) {
-
- /**
- * if the pipe fd is ready to read, return out to
- * rebuild the wait list.
- */
- /* Data on the pipe fd means a new interrupt was registered: return -1 so the caller leaves its loop, rescans intr_sources and adds the new fd. */
- if (events[n].data.fd == intr_pipe.readfd){
- int r = read(intr_pipe.readfd, buf.charbuf,
- sizeof(buf.charbuf));
- RTE_SET_USED(r);
- return -1;
- }
- rte_spinlock_lock(&intr_lock);
- /* Walk intr_sources to find the source this event belongs to. */
- TAILQ_FOREACH(src, &intr_sources, next)
- if (src->intr_handle.fd ==
- events[n].data.fd) /* does this source own the fd that fired? */
- break;
- if (src == NULL){
- rte_spinlock_unlock(&intr_lock);
- continue;
- }
-
- /* mark this interrupt source as active and release the lock. */
- src->active = 1; /* the interrupt of this device has not been handled yet */
- rte_spinlock_unlock(&intr_lock);
-
- /* set the length to be read for different handle type */
- /* Choose how many bytes to read according to the handle type (UIO, VFIO, alarm, ...). */
- switch (src->intr_handle.type) {
- case RTE_INTR_HANDLE_UIO:
- case RTE_INTR_HANDLE_UIO_INTX:
- bytes_read = sizeof(buf.uio_intr_count);
- break;
- case RTE_INTR_HANDLE_ALARM:
- bytes_read = sizeof(buf.timerfd_num);
- break;
- #ifdef VFIO_PRESENT
- case RTE_INTR_HANDLE_VFIO_MSIX:
- case RTE_INTR_HANDLE_VFIO_MSI:
- case RTE_INTR_HANDLE_VFIO_LEGACY:
- bytes_read = sizeof(buf.vfio_intr_count);
- break;
- #endif
- case RTE_INTR_HANDLE_EXT:
- default:
- bytes_read = 1;
- break;
- }
- /* Read the interrupt data from the fd, except for RTE_INTR_HANDLE_EXT handles. */
- if (src->intr_handle.type != RTE_INTR_HANDLE_EXT) {
- /**
- * read out to clear the ready-to-be-read flag
- * for epoll_wait.
- */
- bytes_read = read(events[n].data.fd, &buf, bytes_read);
- if (bytes_read < 0) {
- if (errno == EINTR || errno == EWOULDBLOCK)
- continue;
-
- RTE_LOG(ERR, EAL, "Error reading from file "
- "descriptor %d: %s\n",
- events[n].data.fd,
- strerror(errno));
- } else if (bytes_read == 0)
- RTE_LOG(ERR, EAL, "Read nothing from file "
- "descriptor %d\n", events[n].data.fd);
- }
-
- /* grab a lock, again to call callbacks and update status. */
- rte_spinlock_lock(&intr_lock);
- /* Invoke the device's own interrupt handlers (skipped when the read failed or returned nothing). */
- if (bytes_read > 0) {
-
- /* Finally, call all callbacks. */
- /* Note: every handler registered for this device is called. */
- TAILQ_FOREACH(cb, &src->callbacks, next) {
-
- /* make a copy and unlock. */
- active_cb = *cb;
- rte_spinlock_unlock(&intr_lock);
-
- /* call the actual callback */
- active_cb.cb_fn(&src->intr_handle,
- active_cb.cb_arg);
-
- /*get the lock back. */
- rte_spinlock_lock(&intr_lock);
- }
- }
-
- /* we done with that interrupt source, release it. */
- src->active = 0; /* clear the device's interrupt state once handling is done */
- rte_spinlock_unlock(&intr_lock);
- }
-
- return 0;
- }
到此设备中断的相关初始化就结束了,整个过程如下所示:
3.5.2 设备中断注册
那么中断又是怎么注册的呢?这就不得不提rte_intr_callback_register这个函数,设备的中断处理都是通过这个函数注册的,我们看下它的实现。
● rte_intr_callback_register
点击(此处)折叠或打开
- int
- rte_intr_callback_register(const struct rte_intr_handle *intr_handle,
- rte_intr_callback_fn cb, void *cb_arg)
- { /* Attach callback cb/cb_arg to the source whose fd equals intr_handle->fd, creating the source if needed, and wake the interrupt thread when the fd must be added to epoll. Returns 0, -EINVAL, -ENOMEM or -EPIPE. */
- int ret, wake_thread;
- struct rte_intr_source *src;
- struct rte_intr_callback *callback;
-
- wake_thread = 0;
-
- /* first do parameter checking */
- if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) {
- RTE_LOG(ERR, EAL,
- "Registering with invalid input parameter\n");
- return -EINVAL;
- }
-
- /* allocate a new interrupt callback entity */
- callback = rte_zmalloc("interrupt callback list",
- sizeof(*callback), 0);
- if (callback == NULL) {
- RTE_LOG(ERR, EAL, "Can not allocate memory\n");
- return -ENOMEM;
- }
- /* Fill in the callback entry. */
- callback->cb_fn = cb;
- callback->cb_arg = cb_arg;
-
- rte_spinlock_lock(&intr_lock);
-
- /* check if there is at least one callback registered for the fd */
- /* Walk intr_sources looking for the matching rte_intr_source. */
- TAILQ_FOREACH(src, &intr_sources, next) {
- if (src->intr_handle.fd == intr_handle->fd) {
- /* we had no interrupts for this */
- /* If this source had no handler before, the interrupt thread must be woken so that it adds the fd to epoll. */
- if TAILQ_EMPTY(&src->callbacks)
- wake_thread = 1;
- /* If handlers already exist the fd is already in epoll; the new callback only has to be appended to the list. */
- TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
- ret = 0;
- break;
- }
- }
- /* If no rte_intr_source exists for this fd, create one and add it to the global list. */
- /* no existing callbacks for this - add new source */
- if (src == NULL) {
- if ((src = rte_zmalloc("interrupt source list",
- sizeof(*src), 0)) == NULL) {
- RTE_LOG(ERR, EAL, "Can not allocate memory\n");
- rte_free(callback);
- ret = -ENOMEM;
- } else {
- src->intr_handle = *intr_handle;
- TAILQ_INIT(&src->callbacks);
- TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
- TAILQ_INSERT_TAIL(&intr_sources, src, next);
- wake_thread = 1;
- ret = 0;
- }
- }
-
- rte_spinlock_unlock(&intr_lock);
-
- /**
- * check if need to notify the pipe fd waited by epoll_wait to
- * rebuild the wait list.
- */
- if (wake_thread) /* wake the interrupt-handling thread by writing to the pipe */
- if (write(intr_pipe.writefd, "1", 1) < 0)
- return -EPIPE;
-
- return ret;
- }
这个函数主要是为中断创建一个rte_intr_source结构,我们从其参数可以看出来,参数正是rte_intr_source结构成员所需要的,然后将rte_intr_source结构加入全局链表intr_sources中,并通知前面创建的中断处理线程,中断处理线程可以再次遍历intr_sources,将新加入的rte_intr_source中的handle->fd加入epoll中处理。整个处理流程如下所示。
下面列举一个uio/vfio设备的中断回调函数注册的完整路径,以ixgbevf为例:
rte_eth_dev_pci_probe → eth_ixgbevf_dev_init → rte_intr_callback_register,其中rte_eth_dev_pci_probe在下面的“绑定驱动”中会介绍。
对应ixgbevf其调用如下,这次的中断处理函数为ixgbevf_dev_interrupt_handler。
rte_intr_callback_register(intr_handle,ixgbevf_dev_interrupt_handler,eth_dev);
● ixgbevf_dev_interrupt_handler
点击(此处)折叠或打开
- static void ixgbevf_dev_interrupt_handler(__rte_unused struct rte_intr_handle *handle,
- void *param)
- { /* Interrupt callback for ixgbevf: param is the rte_eth_dev; the handler fetches the interrupt status and then acts on it. */
- struct rte_eth_dev *dev = (struct rte_eth_dev *)param;
- /* Per the original note, interrupts are temporarily disabled at this step (presumably inside the call below, which is not shown here). */
- ixgbevf_dev_interrupt_get_status(dev);
- ixgbevf_dev_interrupt_action(dev);
- }
● ixgbevf_dev_interrupt_action
其中主要是SRIOV设备,mailbox的处理,这里不再展开。
点击(此处)折叠或打开
- static int ixgbevf_dev_interrupt_action(struct rte_eth_dev *dev)
- { /* Process a pending mailbox event if flagged, then re-enable interrupts; always returns 0. */
- struct ixgbe_hw *hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
- struct ixgbe_interrupt *intr =
- IXGBE_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
-
- if (intr->flags & IXGBE_FLAG_MAILBOX) {
- ixgbevf_mbx_process(dev);
- intr->flags &= ~IXGBE_FLAG_MAILBOX; /* clear the flag once the mailbox has been processed */
- }
- /* Re-enable interrupts. */
- ixgbevf_intr_enable(hw);
-
- return 0;
- }
3.5.3 接收队列中断注册
我们上面讲了设备的中断注册,但是上面所说的中断注册一般不是数据中断,而是控制中断,比如设备状态改变等情况。这种中断我们一般会设置intr_handle->fd,如上面的描述,但是如果我们想要注册设备的接收队列中断呢(rxq interrupt),由于设备可能是多队列,那么显然一个fd是不够的,所以我们可以像上面为vhost_user设备注册中断一样(eth_vhost_install_intr)使用intr_handle->efds这个数组为每个rxq设置一个中断fd。但是这就有个问题,我们在“中断初始化”中分析eal_intr_thread_main中讲过,中断处理线程仅会将intr_handle->fd加入epoll中,但是并不会添加intr_handle->efds。那我们设置intr_handle->efds该怎么使用呢?其实这就涉及到数据面的中断注册了,一个非常好的例子是dpdk代码中的examples\l3fwd-power。
普通的DPDK是采用的PMD模式,也就是轮询模式,这种模式下无论是否有报文处理,都是采用的轮询也就是CPU占用率100%;l3fwd-power就是为了解决这个问题,当CPU根本就不需要处理报文的时候进入省电模式也就是中断模式。我们这里只关注其中的中断注册,其他暂时不去分析。设备的rxq中断是从event_register注册的。
● event_register
点击(此处)折叠或打开
- static int event_register(struct lcore_conf *qconf)
- { /* Register an RX interrupt for every RX queue listed in qconf; returns the first non-zero result of rte_eth_dev_rx_intr_ctl_q(), or 0. */
- struct lcore_rx_queue *rx_queue;
- uint8_t portid, queueid;
- uint32_t data;
- int ret;
- int i;
- /* Call rte_eth_dev_rx_intr_ctl_q for each RX queue to register its interrupt. */
- for (i = 0; i < qconf->n_rx_queue; ++i) {
- rx_queue = &(qconf->rx_queue_list[i]);
- portid = rx_queue->port_id;
- queueid = rx_queue->queue_id;
- data = portid << CHAR_BIT | queueid; /* pack the port id above the queue id; passed below as the event's user data */
-
- ret = rte_eth_dev_rx_intr_ctl_q(portid, queueid,
- RTE_EPOLL_PER_THREAD,
- RTE_INTR_EVENT_ADD,
- (void *)((uintptr_t)data));
- if (ret)
- return ret;
- }
-
- return 0;
- }
其中注册每个rxq的中断由rte_eth_dev_rx_intr_ctl_q函数完成,注意RTE_EPOLL_PER_THREAD的值为-1。
● rte_eth_dev_rx_intr_ctl_q
点击(此处)折叠或打开
- int rte_eth_dev_rx_intr_ctl_q(uint8_t port_id, uint16_t queue_id,
- int epfd, int op, void *data)
- { /* Validate port/queue and the device's intr_handle, then apply op to the queue's interrupt vector via rte_intr_rx_ctl(). Returns 0 on success (an -EEXIST result is also treated as success), otherwise a negative errno value. */
- uint32_t vec;
- struct rte_eth_dev *dev;
- struct rte_intr_handle *intr_handle;
- int rc;
-
- RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV);
- /* Look up the struct rte_eth_dev for port_id. */
- dev = &rte_eth_devices[port_id];
- if (queue_id >= dev->data->nb_rx_queues) {
- RTE_PMD_DEBUG_TRACE("Invalid RX queue_id=%u\n", queue_id);
- return -EINVAL;
- }
- /* Check that the device has an intr_handle set up. */
- if (!dev->intr_handle) {
- RTE_PMD_DEBUG_TRACE("RX Intr handle unset\n");
- return -ENOTSUP;
- }
-
- intr_handle = dev->intr_handle;
- if (!intr_handle->intr_vec) {
- RTE_PMD_DEBUG_TRACE("RX Intr vector unset\n");
- return -EPERM;
- }
- /* intr_handle->intr_vec[queue_id] is the queue's ring index (per the original note). */
- vec = intr_handle->intr_vec[queue_id];
- rc = rte_intr_rx_ctl(intr_handle, epfd, op, vec, data);
- if (rc && rc != -EEXIST) {
- RTE_PMD_DEBUG_TRACE("p %u q %u rx ctl error"
- " op %d epfd %d vec %u\n",
- port_id, queue_id, op, epfd, vec);
- return rc;
- }
-
- return 0;
- }
这个函数进行了一系列检查,最终调用rte_intr_rx_ctl完成中断fd注册。在看rte_intr_rx_ctl实现之前,先看下rte_intr_handle之前没有展开的结构细节,如下所示。
● rte_intr_rx_ctl
点击(此处)折叠或打开
- int
- rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
- int op, unsigned int vec, void *data)
- { /* Add or remove the event fd of interrupt vector vec to/from an epoll instance. Returns 0 on success, -EEXIST if already added, -EPERM on any other failure. */
- struct rte_epoll_event *rev;
- struct rte_epoll_data *epdata;
- int epfd_op;
- unsigned int efd_idx;
- int rc = 0;
-
- efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ?
- (vec - RTE_INTR_VEC_RXTX_OFFSET) : vec; /* map the vector number to an index into efds[]/elist[] */
-
- if (!intr_handle || intr_handle->nb_efd == 0 ||
- efd_idx >= intr_handle->nb_efd) {
- RTE_LOG(ERR, EAL, "Wrong intr vector number.\n");
- return -EPERM;
- }
-
- switch (op) {
- case RTE_INTR_EVENT_ADD:
- epfd_op = EPOLL_CTL_ADD;
- rev = &intr_handle->elist[efd_idx];
- /* rev->status != RTE_EPOLL_INVALID means this interrupt fd has already been added to epoll. */
- if (rev->status != RTE_EPOLL_INVALID) {
- RTE_LOG(INFO, EAL, "Event already been added.\n");
- return -EEXIST;
- }
- /* Fill in intr_handle->elist[efd_idx].epdata. */
- /* attach to intr vector fd */
- epdata = &rev->epdata;
- epdata->event = EPOLLIN | EPOLLPRI | EPOLLET;
- epdata->data = data;
- epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr;
- epdata->cb_arg = (void *)intr_handle;
- /* Note: the fd passed here is intr_handle->efds[efd_idx]. */
- rc = rte_epoll_ctl(epfd, epfd_op,
- intr_handle->efds[efd_idx], rev);
- if (!rc)
- RTE_LOG(DEBUG, EAL,
- "efd %d associated with vec %d added on epfd %d"
- "\n", rev->fd, vec, epfd);
- else
- rc = -EPERM;
- break;
- case RTE_INTR_EVENT_DEL:
- epfd_op = EPOLL_CTL_DEL;
- rev = &intr_handle->elist[efd_idx];
- if (rev->status == RTE_EPOLL_INVALID) {
- RTE_LOG(INFO, EAL, "Event does not exist.\n");
- return -EPERM;
- }
-
- rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev); /* uses the epfd and fd recorded in rev, not the epfd argument */
- if (rc)
- rc = -EPERM;
- break;
- default:
- RTE_LOG(ERR, EAL, "event op type mismatch\n");
- rc = -EPERM;
- }
-
- return rc;
- }
由于是中断注册,我们只关注RTE_INTR_EVENT_ADD的逻辑,这里我们终于看到了intr_handle->efds[efd_idx],通过rte_epoll_ctl进行注册,同时我们也看到了这里会初始化一个中断处理函数eal_intr_proc_rxtx_intr,这个我们后面分析。
● rte_epoll_ctl
点击(此处)折叠或打开
- int
- rte_epoll_ctl(int epfd, int op, int fd,
- struct rte_epoll_event *event)
- { /* Wrapper around epoll_ctl() that keeps the rte_epoll_event bookkeeping (status, fd, epfd) in step; returns 0 on success, -1 on failure. */
- struct epoll_event ev;
-
- if (!event) {
- RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
- return -1;
- }
-
- /* using per thread epoll fd */
- /* If epfd is RTE_EPOLL_PER_THREAD, use the fd returned by rte_intr_tls_epfd(); per the original note it is created on demand and kept in a per-thread variable (helper not shown here). */
- if (epfd == RTE_EPOLL_PER_THREAD)
- epfd = rte_intr_tls_epfd();
-
- if (op == EPOLL_CTL_ADD) {
- event->status = RTE_EPOLL_VALID;
- event->fd = fd; /* ignore fd in event */
- event->epfd = epfd;
- ev.data.ptr = (void *)event; /* the rte_epoll_event itself becomes the epoll user data */
- }
-
- ev.events = event->epdata.event;
- /* Apply the operation to the epoll instance. */
- if (epoll_ctl(epfd, op, fd, &ev) < 0) {
- RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n",
- op, fd, strerror(errno));
- if (op == EPOLL_CTL_ADD)
- /* rollback status when CTL_ADD fail */
- event->status = RTE_EPOLL_INVALID;
- return -1;
- }
-
- if (op == EPOLL_CTL_DEL && event->status != RTE_EPOLL_INVALID)
- eal_epoll_data_safe_free(event);
-
- return 0;
- }
这个函数主要就是创建一个per thread的epollfd,然后调用epoll_ctl将rxq的fd加入epollfd。到此中断注册就完成了。下面我们看中断回调过程。整个中断线程就是dataplane的主线程。具体不再展开,调用路径如下所示。
这里我们主要看一下rte_epoll_wait的处理逻辑,之所以要对epoll_wait进行一次封装,主要是在epoll_wait返回后调用了eal_epoll_process_event。
● rte_epoll_wait
点击(此处)折叠或打开
- int rte_epoll_wait(int epfd, struct rte_epoll_event *events,
- int maxevents, int timeout)
- { /* epoll_wait() wrapper that retries on EINTR and post-processes ready events; returns the count from eal_epoll_process_event(), 0 on timeout, or -1 on error. */
- struct epoll_event evs[maxevents];
- int rc;
-
- if (!events) {
- RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
- return -1;
- }
-
- /* using per thread epoll fd */
- /* Fetch the per-thread epoll fd through rte_intr_tls_epfd(). */
- if (epfd == RTE_EPOLL_PER_THREAD)
- epfd = rte_intr_tls_epfd();
-
- while (1) {
- rc = epoll_wait(epfd, evs, maxevents, timeout);
- if (likely(rc > 0)) {
- /* epoll_wait has at least one fd ready to read */
- rc = eal_epoll_process_event(evs, rc, events);
- break;
- } else if (rc < 0) {
- if (errno == EINTR)
- continue;
- /* epoll_wait fail */
- RTE_LOG(ERR, EAL, "epoll_wait returns with fail %s\n",
- strerror(errno));
- rc = -1;
- break;
- } else {
- /* rc == 0, epoll_wait timed out */
- break;
- }
- }
-
- return rc;
- }
● eal_epoll_process_event
点击(此处)折叠或打开
- static int
- eal_epoll_process_event(struct epoll_event *evs, unsigned int n,
- struct rte_epoll_event *events)
- { /* For each of the n raw epoll events, copy the registered rte_epoll_event data into events[] and run its callback; returns how many entries were filled. */
- unsigned int i, count = 0;
- struct rte_epoll_event *rev;
-
- for (i = 0; i < n; i++) {
- rev = evs[i].data.ptr;
- if (!rev || !rte_atomic32_cmpset(&rev->status, RTE_EPOLL_VALID,
- RTE_EPOLL_EXEC)) /* skip entries that are NULL or whose status could not be moved from VALID to EXEC */
- continue;
-
- events[count].status = RTE_EPOLL_VALID;
- events[count].fd = rev->fd;
- events[count].epfd = rev->epfd;
- events[count].epdata.event = rev->epdata.event;
- events[count].epdata.data = rev->epdata.data;
- if (rev->epdata.cb_fun)
- rev->epdata.cb_fun(rev->fd,
- rev->epdata.cb_arg);
-
- rte_compiler_barrier();
- rev->status = RTE_EPOLL_VALID; /* put the status back to VALID after the callback has run */
- count++;
- }
- return count;
- }
而eal_epoll_process_event的主要逻辑就是调用之前rte_intr_rx_ctl中注册的epdata.cb_fun,也就是eal_intr_proc_rxtx_intr。
● eal_intr_proc_rxtx_intr
这个函数其实主要就是读出fd中的数据,以免下次epoll_wait时因fd仍然可读而直接返回。当然这是dpdk 17.02的实现,在18.05中加入了RTE_INTR_HANDLE_VDEV,也就是之前我们注册vhost_user时使用的handle type,对应RTE_INTR_HANDLE_VDEV是不需要从fd读数据的,所以bytes_read为0。
点击(此处)折叠或打开
- static void eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle)
- { /* Drain fd once, reading a size chosen by the handle type; the read is retried only on EINTR/EWOULDBLOCK/EAGAIN. */
- union rte_intr_read_buffer buf;
- int bytes_read = 1;
- int nbytes;
-
- switch (intr_handle->type) {
- case RTE_INTR_HANDLE_UIO:
- case RTE_INTR_HANDLE_UIO_INTX:
- bytes_read = sizeof(buf.uio_intr_count);
- break;
- #ifdef VFIO_PRESENT
- case RTE_INTR_HANDLE_VFIO_MSIX:
- case RTE_INTR_HANDLE_VFIO_MSI:
- case RTE_INTR_HANDLE_VFIO_LEGACY:
- bytes_read = sizeof(buf.vfio_intr_count);
- break;
- #endif
- default:
- bytes_read = 1;
- RTE_LOG(INFO, EAL, "unexpected intr type\n");
- break;
- }
-
- /**
- * read out to clear the ready-to-be-read flag
- * for epoll_wait.
- */
- do {
- nbytes = read(fd, &buf, bytes_read);
- if (nbytes < 0) {
- if (errno == EINTR || errno == EWOULDBLOCK ||
- errno == EAGAIN)
- continue; /* jumps to the while (1) test, so the read is attempted again */
- RTE_LOG(ERR, EAL,
- "Error reading from fd %d: %s\n",
- fd, strerror(errno));
- } else if (nbytes == 0)
- RTE_LOG(ERR, EAL, "Read nothing from fd %d\n", fd);
- return; /* any outcome other than a retryable error ends the function after one read */
- } while (1);
- }
之后就返回主线程了,主线程函数在rte_epoll_wait返回后调用收包逻辑处理。
下面是整个中断注册回调逻辑图。