/* l3fwd performs layer-3 (IP) forwarding; it is somewhat more complex than l2fwd. */
/*-
* BSD LICENSE
*
* Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <inttypes.h>
#include <sys/types.h>
#include <string.h>
#include <sys/queue.h>
#include <stdarg.h>
#include <errno.h>
#include <getopt.h>

#include <rte_common.h>
#include <rte_vect.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_eal.h>
#include <rte_per_lcore.h>
#include <rte_launch.h>
#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_prefetch.h>
#include <rte_lcore.h>
#include <rte_per_lcore.h>
#include <rte_branch_prediction.h>
#include <rte_interrupts.h>
#include <rte_pci.h>
#include <rte_random.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_ethdev.h>
#include <rte_ring.h>
#include <rte_mempool.h>
#include <rte_mbuf.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>
#include <rte_string_fns.h>

/* Selectable route lookup back-ends. */
#define APP_LOOKUP_EXACT_MATCH          0
#define APP_LOOKUP_LPM                  1
/* Enables the RFC 1812 IPv4 header sanity checks in the forwarding paths. */
#define DO_RFC_1812_CHECKS

/* LPM routing is used unless the build overrides APP_LOOKUP_METHOD. */
#ifndef APP_LOOKUP_METHOD
#define APP_LOOKUP_METHOD             APP_LOOKUP_LPM
#endif

/*
 * When set to zero, the simple (unoptimized) forwarding path is enabled.
 * When set to one, the optimized forwarding path is enabled.
 * Note that the LPM optimisation path uses SSE4.1 instructions.
 * NOTE(review): the original author observed that the Shenzhen test machine's
 * CPU reports SSE 4.2 and asked whether that has any impact - TODO confirm.
 */
#if ((APP_LOOKUP_METHOD == APP_LOOKUP_LPM) && !defined(__SSE4_1__))
#define ENABLE_MULTI_BUFFER_OPTIMIZE	0
#else
#define ENABLE_MULTI_BUFFER_OPTIMIZE	1
#endif

#if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
#include <rte_hash.h>
#elif (APP_LOOKUP_METHOD == APP_LOOKUP_LPM)
#include <rte_lpm.h>
#include <rte_lpm6.h>
#else
#error "APP_LOOKUP_METHOD set to incorrect value"
#endif

/* printf-style format and matching argument list for a 16-byte IPv6 address. */
#ifndef IPv6_BYTES
#define IPv6_BYTES_FMT "%02x%02x:%02x%02x:%02x%02x:%02x%02x:"\
                       "%02x%02x:%02x%02x:%02x%02x:%02x%02x"
#define IPv6_BYTES(addr) \
	addr[0],  addr[1], addr[2],  addr[3], \
	addr[4],  addr[5], addr[6],  addr[7], \
	addr[8],  addr[9], addr[10], addr[11],\
	addr[12], addr[13],addr[14], addr[15]
#endif

#define RTE_LOGTYPE_L3FWD RTE_LOGTYPE_USER1

#define MAX_JUMBO_PKT_LEN  9600

#define IPV6_ADDR_LEN 16

#define MEMPOOL_CACHE_SIZE 256

#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)

/*
 * This expression is used to calculate the number of mbufs needed depending on
 * user input, taking into account memory for rx and tx hardware rings, cache
 * per lcore and mtable per port per lcore.
 * RTE_MAX is used to ensure that NB_MBUF never goes below a minimum value of 8192.
 * The expansion refers to nb_ports, nb_rx_queue, n_tx_queue and nb_lcores,
 * which must be in scope at the point of use.
 */
#define NB_MBUF RTE_MAX	(						\
		(nb_ports*nb_rx_queue*RTE_TEST_RX_DESC_DEFAULT +	\
		nb_ports*nb_lcores*MAX_PKT_BURST +			\
		nb_ports*n_tx_queue*RTE_TEST_TX_DESC_DEFAULT +		\
		nb_lcores*MEMPOOL_CACHE_SIZE),				\
		(unsigned)8192)

#define MAX_PKT_BURST     32
#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */

/*
 * Try to avoid TX buffering if we have at least MAX_TX_BURST packets to send.
 */
#define	MAX_TX_BURST	(MAX_PKT_BURST / 2)

#define NB_SOCKETS 8

/* Configure how many packets ahead to prefetch, when reading packets */
#define PREFETCH_OFFSET	3

/* Used to mark destination port as 'invalid'. */
#define	BAD_PORT	((uint16_t)-1)

#define FWDSTEP	4

/*
 * Configurable number of RX/TX ring descriptors
 */
#define RTE_TEST_RX_DESC_DEFAULT 128
#define RTE_TEST_TX_DESC_DEFAULT 512
static uint16_t nb_rxd = RTE_TEST_RX_DESC_DEFAULT;
static uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT;

/* ethernet addresses of ports */
static struct ether_addr ports_eth_addr[RTE_MAX_ETHPORTS];

static __m128i val_eth[RTE_MAX_ETHPORTS];

/* replace first 12B of the ethernet header. */
#define	MASK_ETH	0x3f

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;
static int promiscuous_on = 0; /**< Ports set in promiscuous mode off by default. */
static int numa_on = 1; /**< NUMA is enabled by default. */

#if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
static int ipv6 = 0; /**< ipv6 is false by default. */
#endif

/* Per-port staging buffer of packets waiting to be transmitted. */
struct mbuf_table {
	uint16_t len;	/* number of mbufs currently stored in m_table */
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* One (port, queue) pair polled by an lcore. */
struct lcore_rx_queue {
	uint8_t port_id;	/* physical port number */
	uint8_t queue_id;	/* NIC RX queue number on that port */
} __rte_cache_aligned;

#define MAX_RX_QUEUE_PER_LCORE 16	/* at most 16 RX queues per lcore */
#define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS	/* at most RTE_MAX_ETHPORTS TX queues per port */
#define MAX_RX_QUEUE_PER_PORT 128	/* at most 128 RX queues per port */

#define MAX_LCORE_PARAMS 1024
/* One RX assignment: which lcore polls which queue of which port. */
struct lcore_params {
uint8_t port_id; /* physical port number */
uint8_t queue_id; /* NIC queue number */
uint8_t lcore_id; /* lcore number */
} __rte_cache_aligned; static struct lcore_params lcore_params_array[MAX_LCORE_PARAMS]; /* holds up to MAX_LCORE_PARAMS (1024) entries; the default lcore configuration that follows can be edited */
static struct lcore_params lcore_params_array_default[] = {
{, , },//物理端口的编号,网卡队列的编号,lcore的编号
{, , },
{, , },
{, , },
{, , },
{, , },
{, , },
{, , },
{, , },
}; static struct lcore_params * lcore_params = lcore_params_array_default;
static uint16_t nb_lcore_params = sizeof(lcore_params_array_default) /
sizeof(lcore_params_array_default[]);//默认值为9 static struct rte_eth_conf port_conf = {
.rxmode = {
.mq_mode = ETH_MQ_RX_RSS, //看起来l3fwd支持RSS哟
.max_rx_pkt_len = ETHER_MAX_LEN,
.split_hdr_size = ,
.header_split = , /**< Header Split disabled */
.hw_ip_checksum = , /**< IP checksum offload enabled */
.hw_vlan_filter = , /**< VLAN filtering disabled */
.jumbo_frame = , /**< Jumbo Frame Support disabled */
.hw_strip_crc = , /**< CRC stripped by hardware */
},
.rx_adv_conf = {
.rss_conf = {
.rss_key = NULL,
.rss_hf = ETH_RSS_IP,
},
},
.txmode = {
.mq_mode = ETH_MQ_TX_NONE,
},
}; static struct rte_mempool * pktmbuf_pool[NB_SOCKETS]; #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
/*
 * Pick the default hash function for the exact-match tables: the CRC-based
 * hash when the build targets SSE4.2, the Jenkins hash otherwise.
 */
#ifdef RTE_MACHINE_CPUFLAG_SSE4_2
#include <rte_hash_crc.h>
#define DEFAULT_HASH_FUNC rte_hash_crc
#else
#include <rte_jhash.h>
#define DEFAULT_HASH_FUNC rte_jhash
#endif
/* IPv4 flow 5-tuple as written in the static route table (packed, no padding). */
struct ipv4_5tuple {
uint32_t ip_dst; /* destination IP address */
uint32_t ip_src; /* source IP address */
uint16_t port_dst; /* destination port */
uint16_t port_src; /* source port */
uint8_t proto; /* transport-layer protocol */
} __attribute__((__packed__));
/*
 * IPv4 lookup key, overlaid on one 128-bit register so it can be filled by a
 * single 16-byte load from the packet followed by a mask (see the hash
 * version of get_ipv4_dst_port(), which loads from ipv4_hdr.time_to_live).
 * pad0/pad1 cover the bytes of that load that are not part of the 5-tuple.
 */
union ipv4_5tuple_host {
	struct {
		uint8_t  pad0;
		uint8_t  proto;
		uint16_t pad1;
		uint32_t ip_src;
		uint32_t ip_dst;
		uint16_t port_src;
		uint16_t port_dst;
	};
	__m128i xmm;
};

/* Number of 128-bit registers covering union ipv6_5tuple_host. */
#define XMM_NUM_IN_IPV6_5TUPLE 3
/* IPv6 flow 5-tuple as written in the static route table (packed, no padding). */
struct ipv6_5tuple {
uint8_t ip_dst[IPV6_ADDR_LEN]; /* destination IPv6 address */
uint8_t ip_src[IPV6_ADDR_LEN]; /* source IPv6 address */
uint16_t port_dst; /* destination port */
uint16_t port_src; /* source port */
uint8_t proto; /* transport-layer protocol */
} __attribute__((__packed__));
/*
 * IPv6 lookup key, overlaid on XMM_NUM_IN_IPV6_5TUPLE 128-bit registers so it
 * can be filled by three 16-byte loads from the packet (the hash version of
 * get_ipv6_dst_port() loads starting at ipv6_hdr.payload_len) and masks.
 * pad0, pad1 and reserve cover loaded bytes that are not part of the 5-tuple.
 */
union ipv6_5tuple_host {
struct {
uint16_t pad0;
uint8_t proto;
uint8_t pad1;
uint8_t ip_src[IPV6_ADDR_LEN];
uint8_t ip_dst[IPV6_ADDR_LEN];
uint16_t port_src;
uint16_t port_dst;
uint64_t reserve;
};
__m128i xmm[XMM_NUM_IN_IPV6_5TUPLE];
};
/* Exact-match IPv4 route: a flow 5-tuple and the port it is forwarded to. */
struct ipv4_l3fwd_route {
	struct ipv4_5tuple key;
	uint8_t if_out;		/* output port */
};

/* Exact-match IPv6 route: a flow 5-tuple and the port it is forwarded to. */
struct ipv6_l3fwd_route {
	struct ipv6_5tuple key;
	uint8_t if_out;		/* output port */
};
//这里设置默认的静态的三层转发路由规则,实际使用的时候需要修改这个地方
static struct ipv4_l3fwd_route ipv4_l3fwd_route_array[] = {
{{IPv4(,,,), IPv4(,,,), , , IPPROTO_TCP}, },
{{IPv4(,,,), IPv4(,,,), , , IPPROTO_TCP}, },
{{IPv4(,,,), IPv4(,,,), , , IPPROTO_TCP}, },
{{IPv4(,,,), IPv4(,,,), , , IPPROTO_TCP}, },
};
static struct ipv6_l3fwd_route ipv6_l3fwd_route_array[] = {
{{ {0xfe, 0x80, , , , , , , 0x02, 0x1e, 0x67, 0xff, 0xfe, , , },
{0xfe, 0x80, , , , , , , 0x02, 0x1b, 0x21, 0xff, 0xfe, 0x91, 0x38, 0x05},
, , IPPROTO_TCP}, },
{{ {0xfe, 0x90, , , , , , , 0x02, 0x1e, 0x67, 0xff, 0xfe, , , },
{0xfe, 0x90, , , , , , , 0x02, 0x1b, 0x21, 0xff, 0xfe, 0x91, 0x38, 0x05},
, , IPPROTO_TCP}, },
{{ {0xfe, 0xa0, , , , , , , 0x02, 0x1e, 0x67, 0xff, 0xfe, , , },
{0xfe, 0xa0, , , , , , , 0x02, 0x1b, 0x21, 0xff, 0xfe, 0x91, 0x38, 0x05},
, , IPPROTO_TCP}, },
{{ {0xfe, 0xb0, , , , , , , 0x02, 0x1e, 0x67, 0xff, 0xfe, , , },
{0xfe, 0xb0, , , , , , , 0x02, 0x1b, 0x21, 0xff, 0xfe, 0x91, 0x38, 0x05},
, , IPPROTO_TCP}, },
};
/* In exact-match mode both address families are looked up in rte_hash tables. */
typedef struct rte_hash lookup_struct_t;
static lookup_struct_t *ipv4_l3fwd_lookup_struct[NB_SOCKETS];
static lookup_struct_t *ipv6_l3fwd_lookup_struct[NB_SOCKETS];

/* Parenthesised so the product stays intact inside larger expressions. */
#ifdef RTE_ARCH_X86_64
/* default to 4 million hash entries (approx) */
#define L3FWD_HASH_ENTRIES		(1024*1024*4)
#else
/* 32-bit has less address-space for hugepage memory, limit to 1M entries */
#define L3FWD_HASH_ENTRIES		(1024*1024*1)
#endif
#define HASH_ENTRY_NUMBER_DEFAULT	4

static uint32_t hash_entry_number = HASH_ENTRY_NUMBER_DEFAULT;
/*
 * Hash an IPv4 lookup key (union ipv4_5tuple_host).
 *
 * Folds, in order, the protocol, source address, destination address and the
 * 32-bit word starting at port_src (i.e. both ports) into init_val.
 * Uses the CRC hash when built for SSE4.2, the Jenkins hash otherwise.
 *
 * @param data      pointer to a union ipv4_5tuple_host
 * @param data_len  unused; the key size is fixed
 * @param init_val  seed / running hash value
 * @return the updated hash value
 */
static inline uint32_t
ipv4_hash_crc(const void *data, __rte_unused uint32_t data_len,
	uint32_t init_val)
{
	const union ipv4_5tuple_host *k;
	uint32_t t;
	const uint32_t *p;

	k = data;
	t = k->proto;
	/* port_src and port_dst are adjacent; read them as one 32-bit word. */
	p = (const uint32_t *)&k->port_src;

#ifdef RTE_MACHINE_CPUFLAG_SSE4_2
	init_val = rte_hash_crc_4byte(t, init_val);
	init_val = rte_hash_crc_4byte(k->ip_src, init_val);
	init_val = rte_hash_crc_4byte(k->ip_dst, init_val);
	init_val = rte_hash_crc_4byte(*p, init_val);
#else /* RTE_MACHINE_CPUFLAG_SSE4_2 */
	init_val = rte_jhash_1word(t, init_val);
	init_val = rte_jhash_1word(k->ip_src, init_val);
	init_val = rte_jhash_1word(k->ip_dst, init_val);
	init_val = rte_jhash_1word(*p, init_val);
#endif /* RTE_MACHINE_CPUFLAG_SSE4_2 */
	return (init_val);
}
/*
 * Hash an IPv6 lookup key (union ipv6_5tuple_host).
 *
 * Folds, in order, the protocol, the source address, the destination address
 * and the 32-bit word starting at port_src (both ports) into init_val.
 * With SSE4.2 each 16-byte address is consumed as four 32-bit words at byte
 * offsets 0, 4, 8 and 12; otherwise the Jenkins hash runs over each address.
 *
 * @param data      pointer to a union ipv6_5tuple_host
 * @param data_len  unused; the key size is fixed
 * @param init_val  seed / running hash value
 * @return the updated hash value
 */
static inline uint32_t
ipv6_hash_crc(const void *data, __rte_unused uint32_t data_len,
	uint32_t init_val)
{
	const union ipv6_5tuple_host *k;
	uint32_t t;
	const uint32_t *p;
#ifdef RTE_MACHINE_CPUFLAG_SSE4_2
	const uint32_t  *ip_src0, *ip_src1, *ip_src2, *ip_src3;
	const uint32_t  *ip_dst0, *ip_dst1, *ip_dst2, *ip_dst3;
#endif /* RTE_MACHINE_CPUFLAG_SSE4_2 */

	k = data;
	t = k->proto;
	/* port_src and port_dst are adjacent; read them as one 32-bit word. */
	p = (const uint32_t *)&k->port_src;

#ifdef RTE_MACHINE_CPUFLAG_SSE4_2
	ip_src0 = (const uint32_t *) k->ip_src;
	ip_src1 = (const uint32_t *)(k->ip_src+4);
	ip_src2 = (const uint32_t *)(k->ip_src+8);
	ip_src3 = (const uint32_t *)(k->ip_src+12);
	ip_dst0 = (const uint32_t *) k->ip_dst;
	ip_dst1 = (const uint32_t *)(k->ip_dst+4);
	ip_dst2 = (const uint32_t *)(k->ip_dst+8);
	ip_dst3 = (const uint32_t *)(k->ip_dst+12);
	init_val = rte_hash_crc_4byte(t, init_val);
	init_val = rte_hash_crc_4byte(*ip_src0, init_val);
	init_val = rte_hash_crc_4byte(*ip_src1, init_val);
	init_val = rte_hash_crc_4byte(*ip_src2, init_val);
	init_val = rte_hash_crc_4byte(*ip_src3, init_val);
	init_val = rte_hash_crc_4byte(*ip_dst0, init_val);
	init_val = rte_hash_crc_4byte(*ip_dst1, init_val);
	init_val = rte_hash_crc_4byte(*ip_dst2, init_val);
	init_val = rte_hash_crc_4byte(*ip_dst3, init_val);
	init_val = rte_hash_crc_4byte(*p, init_val);
#else /* RTE_MACHINE_CPUFLAG_SSE4_2 */
	init_val = rte_jhash_1word(t, init_val);
	init_val = rte_jhash(k->ip_src, sizeof(uint8_t) * IPV6_ADDR_LEN, init_val);
	init_val = rte_jhash(k->ip_dst, sizeof(uint8_t) * IPV6_ADDR_LEN, init_val);
	init_val = rte_jhash_1word(*p, init_val);
#endif /* RTE_MACHINE_CPUFLAG_SSE4_2 */
	return (init_val);
}
#define IPV4_L3FWD_NUM_ROUTES \
(sizeof(ipv4_l3fwd_route_array) / sizeof(ipv4_l3fwd_route_array[]))
#define IPV6_L3FWD_NUM_ROUTES \
(sizeof(ipv6_l3fwd_route_array) / sizeof(ipv6_l3fwd_route_array[]))
static uint8_t ipv4_l3fwd_out_if[L3FWD_HASH_ENTRIES] __rte_cache_aligned;
static uint8_t ipv6_l3fwd_out_if[L3FWD_HASH_ENTRIES] __rte_cache_aligned;
#endif #if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM)
/* LPM IPv4 route: prefix, prefix length and the port it is forwarded to. */
struct ipv4_l3fwd_route {
	uint32_t ip;		/* IPv4 prefix */
	uint8_t  depth;		/* prefix length in bits */
	uint8_t  if_out;	/* output port */
};

/* LPM IPv6 route: prefix, prefix length and the port it is forwarded to. */
struct ipv6_l3fwd_route {
	uint8_t ip[16];		/* IPv6 prefix */
	uint8_t depth;		/* prefix length in bits */
	uint8_t if_out;		/* output port */
};

/* The default static layer-3 routes follow; edit them for real use. */
/*
 * Default static LPM IPv4 routes (8 entries): {prefix, depth, output port}.
 * To route a real network, replace an entry with e.g.
 * {IPv4(192,168,10,0), 24, 0}.
 * NOTE(review): the numeric fields were lost from this copy of the file; the
 * values below are those of the stock DPDK l3fwd sample - confirm.
 */
static struct ipv4_l3fwd_route ipv4_l3fwd_route_array[] = {
	{IPv4(1,1,1,0), 24, 0},
	{IPv4(2,1,1,0), 24, 1},
	{IPv4(3,1,1,0), 24, 2},
	{IPv4(4,1,1,0), 24, 3},
	{IPv4(5,1,1,0), 24, 4},
	{IPv4(6,1,1,0), 24, 5},
	{IPv4(7,1,1,0), 24, 6},
	{IPv4(8,1,1,0), 24, 7},
};

/* Default static LPM IPv6 routes: {prefix bytes, depth, output port}. */
static struct ipv6_l3fwd_route ipv6_l3fwd_route_array[] = {
	{{1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, 48, 0},
	{{2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, 48, 1},
	{{3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, 48, 2},
	{{4,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, 48, 3},
	{{5,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, 48, 4},
	{{6,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, 48, 5},
	{{7,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, 48, 6},
	{{8,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, 48, 7},
};

/*
 * A second, 5-tuple-style definition of both route tables had been pasted
 * here. It redefined the arrays above and did not match the LPM route
 * structs, so it has been removed; the exact-match tables live in the
 * APP_LOOKUP_EXACT_MATCH branch.
 */

#define IPV4_L3FWD_NUM_ROUTES \
	(sizeof(ipv4_l3fwd_route_array) / sizeof(ipv4_l3fwd_route_array[0]))
#define IPV6_L3FWD_NUM_ROUTES \
	(sizeof(ipv6_l3fwd_route_array) / sizeof(ipv6_l3fwd_route_array[0]))

#define IPV4_L3FWD_LPM_MAX_RULES         1024
#define IPV6_L3FWD_LPM_MAX_RULES         1024
#define IPV6_L3FWD_LPM_NUMBER_TBL8S (1 << 16)

typedef struct rte_lpm lookup_struct_t;
typedef struct rte_lpm6 lookup6_struct_t;
static lookup_struct_t *ipv4_l3fwd_lookup_struct[NB_SOCKETS];	/* NB_SOCKETS (8) entries */
static lookup6_struct_t *ipv6_l3fwd_lookup_struct[NB_SOCKETS];
#endif

/* Per-lcore forwarding configuration and TX staging state. */
struct lcore_conf {
	uint16_t n_rx_queue;	/* number of valid entries in rx_queue_list */
	struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];	/* (port, queue) pairs to poll */
	uint16_t tx_queue_id[RTE_MAX_ETHPORTS];	/* TX queue to use, indexed by port */
	struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];	/* pending TX packets, indexed by port */
	lookup_struct_t * ipv4_lookup_struct;	/* struct rte_lpm * in LPM mode */
#if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM)
	lookup6_struct_t * ipv6_lookup_struct;
#else
	lookup_struct_t * ipv6_lookup_struct;
#endif
} __rte_cache_aligned;

static struct lcore_conf lcore_conf[RTE_MAX_LCORE];

/* Send burst of packets on an output interface */
/*
 * Transmit the first n packets staged for `port` in qconf->tx_mbufs.
 *
 * Packets the driver does not accept are freed, so every staged packet is
 * either sent or released. The caller is responsible for resetting the
 * staging length.
 *
 * @param qconf  configuration of the calling lcore
 * @param n      number of staged packets to send
 * @param port   output port
 * @return always 0
 */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
	struct rte_mbuf **m_table;
	int ret;
	uint16_t queueid;

	queueid = qconf->tx_queue_id[port];
	m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

	ret = rte_eth_tx_burst(port, queueid, m_table, n);
	if (unlikely(ret < n)) {
		/* Drop whatever the TX queue could not take. */
		do {
			rte_pktmbuf_free(m_table[ret]);
		} while (++ret < n);
	}

	return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int //发送一个mbuf
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
uint32_t lcore_id;
uint16_t len;
struct lcore_conf *qconf; lcore_id = rte_lcore_id(); qconf = &lcore_conf[lcore_id];
len = qconf->tx_mbufs[port].len;
qconf->tx_mbufs[port].m_table[len] = m;
len++; /* enough pkts to be sent */
if (unlikely(len == MAX_PKT_BURST)) { //如果累计到32个数据包
send_burst(qconf, MAX_PKT_BURST, port); //把32个数据包发送出去
len = ;
} qconf->tx_mbufs[port].len = len;
return ;
} static inline __attribute__ void
send_packetsx4(struct lcore_conf *qconf, uint8_t port,
struct rte_mbuf *m[], uint32_t num)
{
uint32_t len, j, n; len = qconf->tx_mbufs[port].len; /* 如果某个队列的发送缓冲区为空,而且已有足够数量数据包待发送,那么立即发送
* If TX buffer for that queue is empty, and we have enough packets,
* then send them straightway.
*/
if (num >= MAX_TX_BURST && len == ) {
n = rte_eth_tx_burst(port, qconf->tx_queue_id[port], m, num);//burst发送num个mbufs
if (unlikely(n < num)) { //如果实际发送数据包的个数小于num
do {
rte_pktmbuf_free(m[n]); //把剩下的num-n个mbufs返回mempool
} while (++n < num);
}
return;
} /*
* Put packets into TX buffer for that queue.
*/
//把那些数据包放到网卡队列的发送缓冲区中
n = len + num;
n = (n > MAX_PKT_BURST) ? MAX_PKT_BURST - len : num; j = ;
switch (n % FWDSTEP) {
while (j < n) {
case :
qconf->tx_mbufs[port].m_table[len + j] = m[j];
j++;
case :
qconf->tx_mbufs[port].m_table[len + j] = m[j];
j++;
case :
qconf->tx_mbufs[port].m_table[len + j] = m[j];
j++;
case :
qconf->tx_mbufs[port].m_table[len + j] = m[j];
j++;
}
} len += n; /*待发送的包数量达到32个 enough pkts to be sent */
if (unlikely(len == MAX_PKT_BURST)) { send_burst(qconf, MAX_PKT_BURST, port); /* copy rest of the packets into the TX buffer. */
len = num - n;
j = ;
switch (len % FWDSTEP) {
while (j < len) {
case :
qconf->tx_mbufs[port].m_table[j] = m[n + j];
j++;
case :
qconf->tx_mbufs[port].m_table[j] = m[n + j];
j++;
case :
qconf->tx_mbufs[port].m_table[j] = m[n + j];
j++;
case :
qconf->tx_mbufs[port].m_table[j] = m[n + j];
j++;
}
}
} qconf->tx_mbufs[port].len = len;
} #ifdef DO_RFC_1812_CHECKS
static inline int
is_valid_ipv4_pkt(struct ipv4_hdr *pkt, uint32_t link_len)
{
/* From http://www.rfc-editor.org/rfc/rfc1812.txt section 5.2.2 */
/*
* 1. The packet length reported by the Link Layer must be large
* enough to hold the minimum length legal IP datagram (20 bytes).
*/
if (link_len < sizeof(struct ipv4_hdr))
return -; /* 2. The IP checksum must be correct. */
/* this is checked in H/W */ /*
* 3. The IP version number must be 4. If the version number is not 4
* then the packet may be another version of IP, such as IPng or
* ST-II.
*/
if (((pkt->version_ihl) >> ) != )
return -;
/*
* 4. The IP header length field must be large enough to hold the
* minimum length legal IP datagram (20 bytes = 5 words).
*/
if ((pkt->version_ihl & 0xf) < )
return -; /*
* 5. The IP total length field must be large enough to hold the IP
* datagram header, whose length is specified in the IP header length
* field.
*/
if (rte_cpu_to_be_16(pkt->total_length) < sizeof(struct ipv4_hdr))
return -; return ;
}
#endif #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) static __m128i mask0;
/* Masks applied to the first and third 16-byte loads of an IPv6 header. */
static __m128i mask1;
static __m128i mask2;

/*
 * Exact-match lookup of the output port for an IPv4 packet.
 *
 * @param ipv4_hdr  pointer to the packet's IPv4 header
 * @param portid    input port, returned when no route matches
 * @param ipv4_l3fwd_lookup_struct  hash table to search
 * @return the output port of the matching route, or portid on a miss
 */
static inline uint8_t
get_ipv4_dst_port(void *ipv4_hdr, uint8_t portid, lookup_struct_t * ipv4_l3fwd_lookup_struct)
{
	int ret = 0;
	union ipv4_5tuple_host key;

	/* One 16-byte load starting at the TTL field covers the whole 5-tuple. */
	ipv4_hdr = (uint8_t *)ipv4_hdr + offsetof(struct ipv4_hdr, time_to_live);
	__m128i data = _mm_loadu_si128((__m128i*)(ipv4_hdr));
	/* Get 5 tuple: dst port, src port, dst IP address, src IP address and protocol */
	key.xmm = _mm_and_si128(data, mask0);
	/* Find destination port */
	ret = rte_hash_lookup(ipv4_l3fwd_lookup_struct, (const void *)&key);
	return (uint8_t)((ret < 0)? portid : ipv4_l3fwd_out_if[ret]);
}

/*
 * Exact-match lookup of the output port for an IPv6 packet.
 *
 * @param ipv6_hdr  pointer to the packet's IPv6 header
 * @param portid    input port, returned when no route matches
 * @param ipv6_l3fwd_lookup_struct  hash table to search
 * @return the output port of the matching route, or portid on a miss
 */
static inline uint8_t
get_ipv6_dst_port(void *ipv6_hdr, uint8_t portid, lookup_struct_t * ipv6_l3fwd_lookup_struct)
{
	int ret = 0;
	union ipv6_5tuple_host key;

	/* Three 16-byte loads starting at payload_len cover the whole 5-tuple. */
	ipv6_hdr = (uint8_t *)ipv6_hdr + offsetof(struct ipv6_hdr, payload_len);
	__m128i data0 = _mm_loadu_si128((__m128i*)(ipv6_hdr));
	__m128i data1 = _mm_loadu_si128((__m128i*)(((uint8_t*)ipv6_hdr)+sizeof(__m128i)));
	__m128i data2 = _mm_loadu_si128((__m128i*)(((uint8_t*)ipv6_hdr)+sizeof(__m128i)+sizeof(__m128i)));
	/* Get part of 5 tuple: src IP address lower 96 bits and protocol */
	key.xmm[0] = _mm_and_si128(data0, mask1);
	/* Get part of 5 tuple: dst IP address lower 96 bits and src IP address higher 32 bits */
	key.xmm[1] = data1;
	/* Get part of 5 tuple: dst port and src port and dst IP address higher 32 bits */
	key.xmm[2] = _mm_and_si128(data2, mask2);

	/* Find destination port */
	ret = rte_hash_lookup(ipv6_l3fwd_lookup_struct, (const void *)&key);
	return (uint8_t)((ret < 0)? portid : ipv6_l3fwd_out_if[ret]);
}
#endif #if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM) static inline uint8_t //LPM情形下获取ipv4数据包的目的端口
get_ipv4_dst_port(void *ipv4_hdr, uint8_t portid, lookup_struct_t * ipv4_l3fwd_lookup_struct)
{
uint8_t next_hop; return (uint8_t) ((rte_lpm_lookup(ipv4_l3fwd_lookup_struct,
rte_be_to_cpu_32(((struct ipv4_hdr *)ipv4_hdr)->dst_addr),
&next_hop) == ) ? next_hop : portid);
} static inline uint8_t
get_ipv6_dst_port(void *ipv6_hdr, uint8_t portid, lookup6_struct_t * ipv6_l3fwd_lookup_struct)
{
uint8_t next_hop;
return (uint8_t) ((rte_lpm6_lookup(ipv6_l3fwd_lookup_struct,
((struct ipv6_hdr*)ipv6_hdr)->dst_addr, &next_hop) == )?
next_hop : portid);
}
#endif static inline void l3fwd_simple_forward(struct rte_mbuf *m, uint8_t portid,
struct lcore_conf *qconf) __attribute__((unused)); #if ((APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH) && \
(ENABLE_MULTI_BUFFER_OPTIMIZE == )) static inline void get_ipv6_5tuple(struct rte_mbuf* m0, __m128i mask0, __m128i mask1,
union ipv6_5tuple_host * key)
{
__m128i tmpdata0 = _mm_loadu_si128((__m128i*)(rte_pktmbuf_mtod(m0, unsigned char *)
+ sizeof(struct ether_hdr) + offsetof(struct ipv6_hdr, payload_len)));
__m128i tmpdata1 = _mm_loadu_si128((__m128i*)(rte_pktmbuf_mtod(m0, unsigned char *)
+ sizeof(struct ether_hdr) + offsetof(struct ipv6_hdr, payload_len)
+ sizeof(__m128i)));
__m128i tmpdata2 = _mm_loadu_si128((__m128i*)(rte_pktmbuf_mtod(m0, unsigned char *)
+ sizeof(struct ether_hdr) + offsetof(struct ipv6_hdr, payload_len)
+ sizeof(__m128i) + sizeof(__m128i)));
key->xmm[] = _mm_and_si128(tmpdata0, mask0);
key->xmm[] = tmpdata1;
key->xmm[] = _mm_and_si128(tmpdata2, mask1);
return;
} static inline void
simple_ipv4_fwd_4pkts(struct rte_mbuf* m[], uint8_t portid, struct lcore_conf *qconf)
{
struct ether_hdr *eth_hdr[];
struct ipv4_hdr *ipv4_hdr[];
void *d_addr_bytes[];
uint8_t dst_port[];
int32_t ret[];
union ipv4_5tuple_host key[];
__m128i data[]; eth_hdr[] = rte_pktmbuf_mtod(m[], struct ether_hdr *);
eth_hdr[] = rte_pktmbuf_mtod(m[], struct ether_hdr *);
eth_hdr[] = rte_pktmbuf_mtod(m[], struct ether_hdr *);
eth_hdr[] = rte_pktmbuf_mtod(m[], struct ether_hdr *); /* Handle IPv4 headers.*/
ipv4_hdr[] = (struct ipv4_hdr *)(rte_pktmbuf_mtod(m[], unsigned char *) +
sizeof(struct ether_hdr));
ipv4_hdr[] = (struct ipv4_hdr *)(rte_pktmbuf_mtod(m[], unsigned char *) +
sizeof(struct ether_hdr));
ipv4_hdr[] = (struct ipv4_hdr *)(rte_pktmbuf_mtod(m[], unsigned char *) +
sizeof(struct ether_hdr));
ipv4_hdr[] = (struct ipv4_hdr *)(rte_pktmbuf_mtod(m[], unsigned char *) +
sizeof(struct ether_hdr)); #ifdef DO_RFC_1812_CHECKS
/* Check to make sure the packet is valid (RFC1812) */
uint8_t valid_mask = MASK_ALL_PKTS;
if (is_valid_ipv4_pkt(ipv4_hdr[], m[]->pkt_len) < ) {
rte_pktmbuf_free(m[]);
valid_mask &= EXECLUDE_1ST_PKT;
}
if (is_valid_ipv4_pkt(ipv4_hdr[], m[]->pkt_len) < ) {
rte_pktmbuf_free(m[]);
valid_mask &= EXECLUDE_2ND_PKT;
}
if (is_valid_ipv4_pkt(ipv4_hdr[], m[]->pkt_len) < ) {
rte_pktmbuf_free(m[]);
valid_mask &= EXECLUDE_3RD_PKT;
}
if (is_valid_ipv4_pkt(ipv4_hdr[], m[]->pkt_len) < ) {
rte_pktmbuf_free(m[]);
valid_mask &= EXECLUDE_4TH_PKT;
}
if (unlikely(valid_mask != MASK_ALL_PKTS)) {
if (valid_mask == ){
return;
} else {
uint8_t i = ;
for (i = ; i < ; i++) {
if ((0x1 << i) & valid_mask) {
l3fwd_simple_forward(m[i], portid, qconf);
}
}
return;
}
}
#endif // End of #ifdef DO_RFC_1812_CHECKS data[] = _mm_loadu_si128((__m128i*)(rte_pktmbuf_mtod(m[], unsigned char *) +
sizeof(struct ether_hdr) + offsetof(struct ipv4_hdr, time_to_live)));
data[] = _mm_loadu_si128((__m128i*)(rte_pktmbuf_mtod(m[], unsigned char *) +
sizeof(struct ether_hdr) + offsetof(struct ipv4_hdr, time_to_live)));
data[] = _mm_loadu_si128((__m128i*)(rte_pktmbuf_mtod(m[], unsigned char *) +
sizeof(struct ether_hdr) + offsetof(struct ipv4_hdr, time_to_live)));
data[] = _mm_loadu_si128((__m128i*)(rte_pktmbuf_mtod(m[], unsigned char *) +
sizeof(struct ether_hdr) + offsetof(struct ipv4_hdr, time_to_live))); key[].xmm = _mm_and_si128(data[], mask0);
key[].xmm = _mm_and_si128(data[], mask0);
key[].xmm = _mm_and_si128(data[], mask0);
key[].xmm = _mm_and_si128(data[], mask0); const void *key_array[] = {&key[], &key[], &key[],&key[]};
rte_hash_lookup_multi(qconf->ipv4_lookup_struct, &key_array[], , ret);
dst_port[] = (uint8_t) ((ret[] < ) ? portid : ipv4_l3fwd_out_if[ret[]]);
dst_port[] = (uint8_t) ((ret[] < ) ? portid : ipv4_l3fwd_out_if[ret[]]);
dst_port[] = (uint8_t) ((ret[] < ) ? portid : ipv4_l3fwd_out_if[ret[]]);
dst_port[] = (uint8_t) ((ret[] < ) ? portid : ipv4_l3fwd_out_if[ret[]]); if (dst_port[] >= RTE_MAX_ETHPORTS || (enabled_port_mask & << dst_port[]) == )
dst_port[] = portid;
if (dst_port[] >= RTE_MAX_ETHPORTS || (enabled_port_mask & << dst_port[]) == )
dst_port[] = portid;
if (dst_port[] >= RTE_MAX_ETHPORTS || (enabled_port_mask & << dst_port[]) == )
dst_port[] = portid;
if (dst_port[] >= RTE_MAX_ETHPORTS || (enabled_port_mask & << dst_port[]) == )
dst_port[] = portid; /* 02:00:00:00:00:xx */
d_addr_bytes[] = ð_hdr[]->d_addr.addr_bytes[];
d_addr_bytes[] = ð_hdr[]->d_addr.addr_bytes[];
d_addr_bytes[] = ð_hdr[]->d_addr.addr_bytes[];
d_addr_bytes[] = ð_hdr[]->d_addr.addr_bytes[];
*((uint64_t *)d_addr_bytes[]) = 0x000000000002 + ((uint64_t)dst_port[] << );
*((uint64_t *)d_addr_bytes[]) = 0x000000000002 + ((uint64_t)dst_port[] << );
*((uint64_t *)d_addr_bytes[]) = 0x000000000002 + ((uint64_t)dst_port[] << );
*((uint64_t *)d_addr_bytes[]) = 0x000000000002 + ((uint64_t)dst_port[] << ); #ifdef DO_RFC_1812_CHECKS
/* Update time to live and header checksum */
--(ipv4_hdr[]->time_to_live);
--(ipv4_hdr[]->time_to_live);
--(ipv4_hdr[]->time_to_live);
--(ipv4_hdr[]->time_to_live);
++(ipv4_hdr[]->hdr_checksum);
++(ipv4_hdr[]->hdr_checksum);
++(ipv4_hdr[]->hdr_checksum);
++(ipv4_hdr[]->hdr_checksum);
#endif /* src addr */
ether_addr_copy(&ports_eth_addr[dst_port[]], ð_hdr[]->s_addr);
ether_addr_copy(&ports_eth_addr[dst_port[]], ð_hdr[]->s_addr);
ether_addr_copy(&ports_eth_addr[dst_port[]], ð_hdr[]->s_addr);
ether_addr_copy(&ports_eth_addr[dst_port[]], ð_hdr[]->s_addr); send_single_packet(m[], (uint8_t)dst_port[]);
send_single_packet(m[], (uint8_t)dst_port[]);
send_single_packet(m[], (uint8_t)dst_port[]);
send_single_packet(m[], (uint8_t)dst_port[]); } #define MASK_ALL_PKTS 0xf
#define EXECLUDE_1ST_PKT 0xe
#define EXECLUDE_2ND_PKT 0xd
#define EXECLUDE_3RD_PKT 0xb
#define EXECLUDE_4TH_PKT 0x7 static inline void
simple_ipv6_fwd_4pkts(struct rte_mbuf* m[], uint8_t portid, struct lcore_conf *qconf)
{
struct ether_hdr *eth_hdr[];
__attribute__((unused)) struct ipv6_hdr *ipv6_hdr[];
void *d_addr_bytes[];
uint8_t dst_port[];
int32_t ret[];
union ipv6_5tuple_host key[]; eth_hdr[] = rte_pktmbuf_mtod(m[], struct ether_hdr *);
eth_hdr[] = rte_pktmbuf_mtod(m[], struct ether_hdr *);
eth_hdr[] = rte_pktmbuf_mtod(m[], struct ether_hdr *);
eth_hdr[] = rte_pktmbuf_mtod(m[], struct ether_hdr *); /* Handle IPv6 headers.*/
ipv6_hdr[] = (struct ipv6_hdr *)(rte_pktmbuf_mtod(m[], unsigned char *) +
sizeof(struct ether_hdr));
ipv6_hdr[] = (struct ipv6_hdr *)(rte_pktmbuf_mtod(m[], unsigned char *) +
sizeof(struct ether_hdr));
ipv6_hdr[] = (struct ipv6_hdr *)(rte_pktmbuf_mtod(m[], unsigned char *) +
sizeof(struct ether_hdr));
ipv6_hdr[] = (struct ipv6_hdr *)(rte_pktmbuf_mtod(m[], unsigned char *) +
sizeof(struct ether_hdr)); get_ipv6_5tuple(m[], mask1, mask2, &key[]);
get_ipv6_5tuple(m[], mask1, mask2, &key[]);
get_ipv6_5tuple(m[], mask1, mask2, &key[]);
get_ipv6_5tuple(m[], mask1, mask2, &key[]); const void *key_array[] = {&key[], &key[], &key[],&key[]};
rte_hash_lookup_multi(qconf->ipv6_lookup_struct, &key_array[], , ret);
dst_port[] = (uint8_t) ((ret[] < )? portid:ipv6_l3fwd_out_if[ret[]]);
dst_port[] = (uint8_t) ((ret[] < )? portid:ipv6_l3fwd_out_if[ret[]]);
dst_port[] = (uint8_t) ((ret[] < )? portid:ipv6_l3fwd_out_if[ret[]]);
dst_port[] = (uint8_t) ((ret[] < )? portid:ipv6_l3fwd_out_if[ret[]]); if (dst_port[] >= RTE_MAX_ETHPORTS || (enabled_port_mask & << dst_port[]) == )
dst_port[] = portid;
if (dst_port[] >= RTE_MAX_ETHPORTS || (enabled_port_mask & << dst_port[]) == )
dst_port[] = portid;
if (dst_port[] >= RTE_MAX_ETHPORTS || (enabled_port_mask & << dst_port[]) == )
dst_port[] = portid;
if (dst_port[] >= RTE_MAX_ETHPORTS || (enabled_port_mask & << dst_port[]) == )
dst_port[] = portid; /* 02:00:00:00:00:xx */
d_addr_bytes[] = ð_hdr[]->d_addr.addr_bytes[];
d_addr_bytes[] = ð_hdr[]->d_addr.addr_bytes[];
d_addr_bytes[] = ð_hdr[]->d_addr.addr_bytes[];
d_addr_bytes[] = ð_hdr[]->d_addr.addr_bytes[];
*((uint64_t *)d_addr_bytes[]) = 0x000000000002 + ((uint64_t)dst_port[] << );
*((uint64_t *)d_addr_bytes[]) = 0x000000000002 + ((uint64_t)dst_port[] << );
*((uint64_t *)d_addr_bytes[]) = 0x000000000002 + ((uint64_t)dst_port[] << );
*((uint64_t *)d_addr_bytes[]) = 0x000000000002 + ((uint64_t)dst_port[] << ); /* src addr */
ether_addr_copy(&ports_eth_addr[dst_port[]], ð_hdr[]->s_addr);
ether_addr_copy(&ports_eth_addr[dst_port[]], ð_hdr[]->s_addr);
ether_addr_copy(&ports_eth_addr[dst_port[]], ð_hdr[]->s_addr);
ether_addr_copy(&ports_eth_addr[dst_port[]], ð_hdr[]->s_addr); send_single_packet(m[], (uint8_t)dst_port[]);
send_single_packet(m[], (uint8_t)dst_port[]);
send_single_packet(m[], (uint8_t)dst_port[]);
send_single_packet(m[], (uint8_t)dst_port[]); }
#endif /* APP_LOOKUP_METHOD */ static inline __attribute__ void //简单三层转发,没有使用SSE4.1优化
l3fwd_simple_forward(struct rte_mbuf *m, uint8_t portid, struct lcore_conf *qconf)
{
struct ether_hdr *eth_hdr;
struct ipv4_hdr *ipv4_hdr;
void *d_addr_bytes;
uint8_t dst_port; eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *); //得到eth_hdr指针 if (m->ol_flags & PKT_RX_IPV4_HDR) { //如果是ipv4包
/* Handle IPv4 headers.*/
ipv4_hdr = (struct ipv4_hdr *)(rte_pktmbuf_mtod(m, unsigned char *) +
sizeof(struct ether_hdr)); #ifdef DO_RFC_1812_CHECKS
/* Check to make sure the packet is valid (RFC1812) */
if (is_valid_ipv4_pkt(ipv4_hdr, m->pkt_len) < ) {
rte_pktmbuf_free(m);
return;
}
#endif
//想要满足文生提出的需求,主要在这里修改ip层和tcp层的数据内容。
dst_port = get_ipv4_dst_port(ipv4_hdr, portid, //获取转发出口
qconf->ipv4_lookup_struct);
if (dst_port >= RTE_MAX_ETHPORTS ||
(enabled_port_mask & << dst_port) == )
dst_port = portid; //出错则直接把入口作为转发出口 /* 02:00:00:00:00:xx 这里是修改目的mac地址吗??? */
d_addr_bytes = ð_hdr->d_addr.addr_bytes[];
*((uint64_t *)d_addr_bytes) = ETHER_LOCAL_ADMIN_ADDR +
((uint64_t)dst_port << ); #ifdef DO_RFC_1812_CHECKS
/* Update time to live and header checksum */
--(ipv4_hdr->time_to_live);
++(ipv4_hdr->hdr_checksum);
#endif /* //把进入包的目的mac地址作为转发包的源地址 src addr */
ether_addr_copy(&ports_eth_addr[dst_port], ð_hdr->s_addr); send_single_packet(m, dst_port); //经过dst_port把转发包发送出去 } else { //如果是ipv6包
/* Handle IPv6 headers.*/
struct ipv6_hdr *ipv6_hdr; ipv6_hdr = (struct ipv6_hdr *)(rte_pktmbuf_mtod(m, unsigned char *) +
sizeof(struct ether_hdr)); dst_port = get_ipv6_dst_port(ipv6_hdr, portid, qconf->ipv6_lookup_struct); if (dst_port >= RTE_MAX_ETHPORTS || (enabled_port_mask & << dst_port) == )
dst_port = portid; /* 02:00:00:00:00:xx */
d_addr_bytes = ð_hdr->d_addr.addr_bytes[];
*((uint64_t *)d_addr_bytes) = ETHER_LOCAL_ADMIN_ADDR +
((uint64_t)dst_port << ); /* src addr */
ether_addr_copy(&ports_eth_addr[dst_port], ð_hdr->s_addr); send_single_packet(m, dst_port);
} } #ifdef DO_RFC_1812_CHECKS #define IPV4_MIN_VER_IHL 0x45
#define IPV4_MAX_VER_IHL 0x4f
#define IPV4_MAX_VER_IHL_DIFF (IPV4_MAX_VER_IHL - IPV4_MIN_VER_IHL) /* Minimum value of IPV4 total length (20B) in network byte order. */
#define IPV4_MIN_LEN_BE (sizeof(struct ipv4_hdr) << 8) /*
* From http://www.rfc-editor.org/rfc/rfc1812.txt section 5.2.2:
* - The IP version number must be 4.
* - The IP header length field must be large enough to hold the
* minimum length legal IP datagram (20 bytes = 5 words).
* - The IP total length field must be large enough to hold the IP
* datagram header, whose length is specified in the IP header length
* field.
* If we encounter invalid IPV4 packet, then set destination port for it
* to BAD_PORT value.
*/
static inline __attribute__ void //ipv4错误检查
rfc1812_process(struct ipv4_hdr *ipv4_hdr, uint16_t *dp, uint32_t flags)
{
uint8_t ihl; if ((flags & PKT_RX_IPV4_HDR) != ) {//如果是ipv4 ihl = ipv4_hdr->version_ihl - IPV4_MIN_VER_IHL; ipv4_hdr->time_to_live--;
ipv4_hdr->hdr_checksum++; if (ihl > IPV4_MAX_VER_IHL_DIFF ||
((uint8_t)ipv4_hdr->total_length == &&
ipv4_hdr->total_length < IPV4_MIN_LEN_BE)) {
dp[] = BAD_PORT; //应该是出错了
}
}
} #else
#define rfc1812_process(mb, dp) do { } while (0)
#endif /* DO_RFC_1812_CHECKS */ #if ((APP_LOOKUP_METHOD == APP_LOOKUP_LPM) && \
(ENABLE_MULTI_BUFFER_OPTIMIZE == )) static inline __attribute__ uint16_t //得到目的ip地址对应的转发出口
get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
uint32_t dst_ipv4, uint8_t portid)
{
uint8_t next_hop;
struct ipv6_hdr *ipv6_hdr;
struct ether_hdr *eth_hdr; if (pkt->ol_flags & PKT_RX_IPV4_HDR) { //如果都是ipv4
if (rte_lpm_lookup(qconf->ipv4_lookup_struct, dst_ipv4,
&next_hop) != ) //返回0则查找到,next_hop中已经得到下一跳
next_hop = portid; //此时没找到,则直接把portid设定为下一跳
} else if (pkt->ol_flags & PKT_RX_IPV6_HDR) { //如果都是ipv6
eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
ipv6_hdr = (struct ipv6_hdr *)(eth_hdr + );
if (rte_lpm6_lookup(qconf->ipv6_lookup_struct,
ipv6_hdr->dst_addr, &next_hop) != )
next_hop = portid;
} else { //如果有其他种类的数据包
next_hop = portid;//设定下一跳
} return next_hop;//返回下一跳
} static inline void //处理一个数据包
process_packet(struct lcore_conf *qconf, struct rte_mbuf *pkt,
uint16_t *dst_port, uint8_t portid)
{
struct ether_hdr *eth_hdr;
struct ipv4_hdr *ipv4_hdr;
uint32_t dst_ipv4;
uint16_t dp;
__m128i te, ve; eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);//获取eth首部
ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + );//获取ipv4首部 dst_ipv4 = ipv4_hdr->dst_addr; //得到大端的ipv4目的地址
dst_ipv4 = rte_be_to_cpu_32(dst_ipv4);//转换成小端
dp = get_dst_port(qconf, pkt, dst_ipv4, portid); //获取转发出口/下一跳 te = _mm_load_si128((__m128i *)eth_hdr);
ve = val_eth[dp]; dst_port[] = dp;
rfc1812_process(ipv4_hdr, dst_port, pkt->ol_flags); te = _mm_blend_epi16(te, ve, MASK_ETH);
_mm_store_si128((__m128i *)eth_hdr, te);
} /* 从4个mbufs中读取目的IP地址和ol_flags
* Read ol_flags and destination IPV4 addresses from 4 mbufs.
*
* dip[0] receives the four raw (still big-endian) destination addresses,
* packet 0 in the lowest 32-bit lane.
* flag[0] receives PKT_RX_IPV4_HDR AND-ed with the ol_flags of all four
* packets, so it is non-zero only when every packet carries that flag.
*/
static inline void
processx4_step1(struct rte_mbuf *pkt[FWDSTEP], __m128i *dip, uint32_t *flag)
{
	struct ipv4_hdr *ipv4_hdr;
	struct ether_hdr *eth_hdr;
	uint32_t x0, x1, x2, x3;

	/* packet 0 */
	eth_hdr = rte_pktmbuf_mtod(pkt[0], struct ether_hdr *);
	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
	x0 = ipv4_hdr->dst_addr;
	flag[0] = pkt[0]->ol_flags & PKT_RX_IPV4_HDR;

	/* packet 1 */
	eth_hdr = rte_pktmbuf_mtod(pkt[1], struct ether_hdr *);
	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
	x1 = ipv4_hdr->dst_addr;
	flag[0] &= pkt[1]->ol_flags;

	/* packet 2 */
	eth_hdr = rte_pktmbuf_mtod(pkt[2], struct ether_hdr *);
	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
	x2 = ipv4_hdr->dst_addr;
	flag[0] &= pkt[2]->ol_flags;

	/* packet 3 */
	eth_hdr = rte_pktmbuf_mtod(pkt[3], struct ether_hdr *);
	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
	x3 = ipv4_hdr->dst_addr;
	flag[0] &= pkt[3]->ol_flags;

	/* pack the four addresses into one 128-bit register */
	dip[0] = _mm_set_epi32(x3, x2, x1, x0);
}

/*
* Lookup into LPM for destination port.
* If lookup fails, use incoming port (portid) as destination port.
*/ //在LPM中查找转发出口/下一跳,如果没有找到则把入口作为转发出口
static inline void
processx4_step2(const struct lcore_conf *qconf, __m128i dip, uint32_t flag,
uint8_t portid, struct rte_mbuf *pkt[FWDSTEP], uint16_t dprt[FWDSTEP])
{
rte_xmm_t dst;
const __m128i bswap_mask = _mm_set_epi8(, , , , , , , ,
, , , , , , , ); //表示重新排列的顺序 /* Byte swap 4 IPV4 addresses. 按照字节交换ipv4地址 */
dip = _mm_shuffle_epi8(dip, bswap_mask); /* 如果4个分组都是ipv4的 if all 4 packets are IPV4. */
if (likely(flag != )) {
rte_lpm_lookupx4(qconf->ipv4_lookup_struct, dip, dprt, portid);
} else {
dst.x = dip; //获取4个目的ip地址
dprt[] = get_dst_port(qconf, pkt[], dst.u32[], portid);//得到下一跳/转发出口
dprt[] = get_dst_port(qconf, pkt[], dst.u32[], portid);
dprt[] = get_dst_port(qconf, pkt[], dst.u32[], portid);
dprt[] = get_dst_port(qconf, pkt[], dst.u32[], portid);
}
} /*
* Update source and destination MAC addresses in the ethernet header.
* Perform RFC1812 checks and updates for IPV4 packets.
*/ //更新目的mac和源mac地址
static inline void
processx4_step3(struct rte_mbuf *pkt[FWDSTEP], uint16_t dst_port[FWDSTEP])
{
__m128i te[FWDSTEP];
__m128i ve[FWDSTEP];
__m128i *p[FWDSTEP]; p[] = (rte_pktmbuf_mtod(pkt[], __m128i *));//指向第一个数据包的内容
p[] = (rte_pktmbuf_mtod(pkt[], __m128i *));
p[] = (rte_pktmbuf_mtod(pkt[], __m128i *));
p[] = (rte_pktmbuf_mtod(pkt[], __m128i *)); ve[] = val_eth[dst_port[]];
te[] = _mm_load_si128(p[]);//将p[0]指向的内容加载到128位寄存器中 ve[] = val_eth[dst_port[]];
te[] = _mm_load_si128(p[]); ve[] = val_eth[dst_port[]];
te[] = _mm_load_si128(p[]); ve[] = val_eth[dst_port[]];
te[] = _mm_load_si128(p[]); /*替换更新前12个字节,保留剩余 Update first 12 bytes, keep rest bytes intact. */
te[] = _mm_blend_epi16(te[], ve[], MASK_ETH);
te[] = _mm_blend_epi16(te[], ve[], MASK_ETH);
te[] = _mm_blend_epi16(te[], ve[], MASK_ETH);
te[] = _mm_blend_epi16(te[], ve[], MASK_ETH); _mm_store_si128(p[], te[]);
_mm_store_si128(p[], te[]);
_mm_store_si128(p[], te[]);
_mm_store_si128(p[], te[]); rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[] + ),
&dst_port[], pkt[]->ol_flags);
rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[] + ),
&dst_port[], pkt[]->ol_flags);
rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[] + ),
&dst_port[], pkt[]->ol_flags);
rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[] + ),
&dst_port[], pkt[]->ol_flags);
} /* //把转发出口相同的连续数据包做一次burst发送
为了避免额外的延迟,与其他的包处理一起完成,但在对转发出口做了决策之后。 * We group consecutive packets with the same destionation port into one burst.
* To avoid extra latency this is done together with some other packet
* processing, but after we made a final decision about packet's destination.
* To do this we maintain:
* pnum - array of number of consecutive packets with the same dest port for
* each packet in the input burst. ***pnum是保存转发出口相同的连续数据包的数组
* lp - pointer to the last updated element in the pnum. ***lp指向pnum中最后一次更新的元素
* dlp - dest port value lp corresponds to. ***dlp为lp对应的转发出口编号
*/ #define GRPSZ (1 << FWDSTEP) //
#define GRPMSK (GRPSZ - 1) // #define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx) do { \
if (likely((dlp) == (dcp)[(idx)])) { \
(lp)[]++; \
} else { \
(dlp) = (dcp)[idx]; \
(lp) = (pn) + (idx); \
(lp)[] = ; \
} \
} while () /*
* Group consecutive packets with the same destination port in bursts of 4.
* Suppose we have array of destionation ports:
* dst_port[] = {a, b, c, d,, e, ... }
* dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
* We doing 4 comparisions at once and the result is 4 bit mask.
* This mask is used as an index into prebuild array of pnum values.
*/
static inline uint16_t * //把出口相同的4个数据包构成一组
port_groupx4(uint16_t pn[FWDSTEP + ], uint16_t *lp, __m128i dp1, __m128i dp2)
{
static const struct {
uint64_t pnum; /*为pnum预设的4个值 prebuild 4 values for pnum[]. */
int32_t idx; /*最后一次更新的元素的索引 index for new last updated elemnet. */
uint16_t lpv; /*把值加到最后一次更新的元素 add value to the last updated element. */
} gptbl[GRPSZ] = {
{
/* 0: a != b, b != c, c != d, d != e */
.pnum = UINT64_C(0x0001000100010001),
.idx = ,
.lpv = ,
},
{
/* 1: a == b, b != c, c != d, d != e */
.pnum = UINT64_C(0x0001000100010002),
.idx = ,
.lpv = ,
},
{
/* 2: a != b, b == c, c != d, d != e */
.pnum = UINT64_C(0x0001000100020001),
.idx = ,
.lpv = ,
},
{
/* 3: a == b, b == c, c != d, d != e */
.pnum = UINT64_C(0x0001000100020003),
.idx = ,
.lpv = ,
},
{
/* 4: a != b, b != c, c == d, d != e */
.pnum = UINT64_C(0x0001000200010001),
.idx = ,
.lpv = ,
},
{
/* 5: a == b, b != c, c == d, d != e */
.pnum = UINT64_C(0x0001000200010002),
.idx = ,
.lpv = ,
},
{
/* 6: a != b, b == c, c == d, d != e */
.pnum = UINT64_C(0x0001000200030001),
.idx = ,
.lpv = ,
},
{
/* 7: a == b, b == c, c == d, d != e */
.pnum = UINT64_C(0x0001000200030004),
.idx = ,
.lpv = ,
},
{
/* 8: a != b, b != c, c != d, d == e */
.pnum = UINT64_C(0x0002000100010001),
.idx = ,
.lpv = ,
},
{
/* 9: a == b, b != c, c != d, d == e */
.pnum = UINT64_C(0x0002000100010002),
.idx = ,
.lpv = ,
},
{
/* 0xa: a != b, b == c, c != d, d == e */
.pnum = UINT64_C(0x0002000100020001),
.idx = ,
.lpv = ,
},
{
/* 0xb: a == b, b == c, c != d, d == e */
.pnum = UINT64_C(0x0002000100020003),
.idx = ,
.lpv = ,
},
{
/* 0xc: a != b, b != c, c == d, d == e */
.pnum = UINT64_C(0x0002000300010001),
.idx = ,
.lpv = ,
},
{
/* 0xd: a == b, b != c, c == d, d == e */
.pnum = UINT64_C(0x0002000300010002),
.idx = ,
.lpv = ,
},
{
/* 0xe: a != b, b == c, c == d, d == e */
.pnum = UINT64_C(0x0002000300040001),
.idx = ,
.lpv = ,
},
{
/* 0xf: a == b, b == c, c == d, d == e */
.pnum = UINT64_C(0x0002000300040005),
.idx = ,
.lpv = ,
},
}; union {
uint16_t u16[FWDSTEP + ];
uint64_t u64;
} *pnum = (void *)pn; int32_t v; dp1 = _mm_cmpeq_epi16(dp1, dp2); //按照16位一个单元来比较dp1和dp2
dp1 = _mm_unpacklo_epi16(dp1, dp1); //按照16位一个单元将dp1与dp1来结合
v = _mm_movemask_ps((__m128)dp1); //根据dp1的4个值形成4个位的掩码 /*更新最后一次端口计数 update last port counter. */
lp[] += gptbl[v].lpv; /*如果转发出口的值已经改变 if dest port value has changed. */
if (v != GRPMSK) {
lp = pnum->u16 + gptbl[v].idx;
lp[] = ;
pnum->u64 = gptbl[v].pnum;
} return lp;
} #endif /* APP_LOOKUP_METHOD */ /* 线程执行函数 main processing loop */
static int
main_loop(__attribute__((unused)) void *dummy)
{
struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; //32个指针构成的数组
unsigned lcore_id;
uint64_t prev_tsc, diff_tsc, cur_tsc;
int i, j, nb_rx;
uint8_t portid, queueid;
struct lcore_conf *qconf;
const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - ) /
US_PER_S * BURST_TX_DRAIN_US; #if ((APP_LOOKUP_METHOD == APP_LOOKUP_LPM) && \
(ENABLE_MULTI_BUFFER_OPTIMIZE == ))
int32_t k;
uint16_t dlp; //dlp为lp对应的转发出口编号
uint16_t *lp; //lp指向pkts_burst中最后一次更新的元素
uint16_t dst_port[MAX_PKT_BURST]; //dst_port是32个数据包的转发出口构成的数组
__m128i dip[MAX_PKT_BURST / FWDSTEP]; //数据包的目的IP地址构成的数组
uint32_t flag[MAX_PKT_BURST / FWDSTEP];
uint16_t pnum[MAX_PKT_BURST + ]; //转发出口相同的数据包的编号
#endif prev_tsc = ; lcore_id = rte_lcore_id(); //获取lcore_id
qconf = &lcore_conf[lcore_id];//获取lcore_id的配置信息 if (qconf->n_rx_queue == ) { //如果lcore上没有接收队列
RTE_LOG(INFO, L3FWD, "lcore %u has nothing to do\n", lcore_id);
return ;
} RTE_LOG(INFO, L3FWD, "entering main loop on lcore %u\n", lcore_id); for (i = ; i < qconf->n_rx_queue; i++) { //遍历所有的接收队列 portid = qconf->rx_queue_list[i].port_id; //得到物理端口的编号
queueid = qconf->rx_queue_list[i].queue_id; //得到网卡队列的编号
RTE_LOG(INFO, L3FWD, " -- lcoreid=%u portid=%hhu rxqueueid=%hhu\n", lcore_id,
portid, queueid);
} while () { //死循环,体现PMD思想 cur_tsc = rte_rdtsc(); /*
* TX burst queue drain
*/
diff_tsc = cur_tsc - prev_tsc; //计算时间差
if (unlikely(diff_tsc > drain_tsc)) { //如果两次时间差大于定值 /*
* This could be optimized (use queueid instead of
* portid), but it is not called so often
*/
for (portid = ; portid < RTE_MAX_ETHPORTS; portid++) {//遍历所有的物理端口
if (qconf->tx_mbufs[portid].len == )
continue;
send_burst(qconf,
qconf->tx_mbufs[portid].len,
portid);
qconf->tx_mbufs[portid].len = ;
} prev_tsc = cur_tsc; //记下前一时间
} /* 从接收队列中读取数据包
* Read packet from RX queues
*/
for (i = ; i < qconf->n_rx_queue; ++i) { //遍历所有的接收队列
portid = qconf->rx_queue_list[i].port_id;//得到物理端口的编号
queueid = qconf->rx_queue_list[i].queue_id; //得到网卡队列的编号
nb_rx = rte_eth_rx_burst(portid, queueid, pkts_burst,
MAX_PKT_BURST); //在每个队列上尽量接收32个数据包,用nb_rx记录实际个数
if (nb_rx == ) //如果一个包也没有收到
continue; #if (ENABLE_MULTI_BUFFER_OPTIMIZE == 1) //如果支持Intel SSE4.1特性
if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM) //如果使用lpm k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP); //整除4
for (j = ; j != k; j += FWDSTEP) { //每次处理4个mbufs
processx4_step1(&pkts_burst[j], //从4个mbufs中读取目的ip地址和ol_flags
&dip[j / FWDSTEP],
&flag[j / FWDSTEP]);
} k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
for (j = ; j != k; j += FWDSTEP) {//每次处理4个mbufs
processx4_step2(qconf, dip[j / FWDSTEP], //在LPM中查找转发出口,如果失败则把进入的端口作为转发出口
flag[j / FWDSTEP], portid,
&pkts_burst[j], &dst_port[j]);
} /* 完成包处理,并根据相同的转发出口来分组连续的数据包
* Finish packet processing and group consecutive
* packets with the same destination port.
*/
k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);//处理成4的幂
if (k != ) {
__m128i dp1, dp2; lp = pnum;
lp[] = ; processx4_step3(pkts_burst, dst_port); //更新目的mac和源mac地址 /* dp1: <d[0], d[1], d[2], d[3], ... > */
dp1 = _mm_loadu_si128((__m128i *)dst_port); //把目的端口加载到寄存器dp1中 for (j = FWDSTEP; j != k; j += FWDSTEP) { //每次处理4个mbufs
processx4_step3(&pkts_burst[j], //更新目的mac和源mac地址
&dst_port[j]); /*
* dp2:
* <d[j-3], d[j-2], d[j-1], d[j], ... >
*/
dp2 = _mm_loadu_si128((__m128i *) //返回一个__m128i的寄存器
&dst_port[j - FWDSTEP + ]);
lp = port_groupx4(&pnum[j - FWDSTEP], //把出口相同的4个数据包构成一组
lp, dp1, dp2); /*
* dp1:
* <d[j], d[j+1], d[j+2], d[j+3], ... >
*/
dp1 = _mm_srli_si128(dp2, //逻辑左移3*16位,返回一个__m128i的寄存器
(FWDSTEP - ) *
sizeof(dst_port[]));
} /*
* dp2: <d[j-3], d[j-2], d[j-1], d[j-1], ... >
*/
dp2 = _mm_shufflelo_epi16(dp1, 0xf9); //重新排序,返回一个__m128i的寄存器
lp = port_groupx4(&pnum[j - FWDSTEP], lp, //把4个连续分组按照目的端口分组
dp1, dp2); /*
* remove values added by the last repeated
* dst port.
*/
lp[]--;
dlp = dst_port[j - ];
} else {
/* set dlp and lp to the never used values. */
dlp = BAD_PORT - ;
lp = pnum + MAX_PKT_BURST;
} /*处理最后的三个分组 Process up to last 3 packets one by one. */
switch (nb_rx % FWDSTEP) {
case : //第三个mbuf
process_packet(qconf, pkts_burst[j],
dst_port + j, portid);
GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
j++;
case ://第二个mbuf
process_packet(qconf, pkts_burst[j],
dst_port + j, portid);
GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
j++;
case ://第一个mbuf
process_packet(qconf, pkts_burst[j],
dst_port + j, portid);
GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
j++;
} /*通过目的端口把数据包都发出去,这些数据包之前已经组合好了的
* Send packets out, through destination port.
* Consecuteve pacekts with the same destination port
* are already grouped together.
* If destination port for the packet equals BAD_PORT,
* then free the packet without sending it out.
*/
for (j = ; j < nb_rx; j += k) { //遍历接收到的数据包 int32_t m;
uint16_t pn; pn = dst_port[j];
k = pnum[j]; if (likely(pn != BAD_PORT)) {
send_packetsx4(qconf, pn, //把待发送的数据包放到发送缓冲区中,累积到32个再发出去
pkts_burst + j, k);
} else {
for (m = j; m != j + k; m++)
rte_pktmbuf_free(pkts_burst[m]);
}
} #endif /* APP_LOOKUP_METHOD */
#else /*如果不支持Intel SSE4.1特性 ENABLE_MULTI_BUFFER_OPTIMIZE == 0 */ /*预取接收队列上的第一个数据包 Prefetch first packets */
for (j = ; j < PREFETCH_OFFSET && j < nb_rx; j++) {
rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j], void *));
} /*预取和转发已经预取的数据包 Prefetch and forward already prefetched packets */
for (j = ; j < (nb_rx - PREFETCH_OFFSET); j++) {
rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
j + PREFETCH_OFFSET], void *));
l3fwd_simple_forward(pkts_burst[j], portid, qconf);//简单转发4倍数的数据包 } /*转发正在预取的数据包 Forward remaining prefetched packets */
for (; j < nb_rx; j++) {
l3fwd_simple_forward(pkts_burst[j], portid, qconf);//简单转发剩余几个数据包 }
#endif /* ENABLE_MULTI_BUFFER_OPTIMIZE */ } //for (i = 0; i < qconf->n_rx_queue; ++i)
} //while (1)
}//end of main_loop static int //检查lcore的参数
check_lcore_params(void)
{
uint8_t queue, lcore;
uint16_t i;
int socketid; for (i = ; i < nb_lcore_params; ++i) { //遍历lcores的参数表
queue = lcore_params[i].queue_id;
if (queue >= MAX_RX_QUEUE_PER_PORT) { //如果队列编号大于128
printf("invalid queue number: %hhu\n", queue);
return -;
}
lcore = lcore_params[i].lcore_id;
if (!rte_lcore_is_enabled(lcore)) { //如果lcore没有启用
printf("error: lcore %hhu is not enabled in lcore mask\n", lcore);
return -;
}
if ((socketid = rte_lcore_to_socket_id(lcore) != ) &&
(numa_on == )) { //如果numa关闭
printf("warning: lcore %hhu is on socket %d with numa off \n",
lcore, socketid);
}
}
return ;
} static int //检查物理端口的配置
check_port_config(const unsigned nb_ports)
{
unsigned portid;
uint16_t i; for (i = ; i < nb_lcore_params; ++i) { //遍历lcores的参数表
portid = lcore_params[i].port_id;
if ((enabled_port_mask & ( << portid)) == ) {
printf("port %u is not enabled in port mask\n", portid);
return -;
}
if (portid >= nb_ports) {
printf("port %u is not present on the board\n", portid);
return -;
}
}
return ;
} static uint8_t //获取物理端口上的接收队列数量
get_port_n_rx_queues(const uint8_t port) //其实就是取queue_id最大值加1
{
int queue = -;
uint16_t i; for (i = ; i < nb_lcore_params; ++i) { //遍历lcores的参数表
if (lcore_params[i].port_id == port && lcore_params[i].queue_id > queue)
queue = lcore_params[i].queue_id;//获取queue_id值
}
return (uint8_t)(++queue); //因为queue_id从0开始
} static int //初始化lcore上的接收队列
init_lcore_rx_queues(void)
{
uint16_t i, nb_rx_queue;
uint8_t lcore; for (i = ; i < nb_lcore_params; ++i) {//遍历lcores的参数表
lcore = lcore_params[i].lcore_id;
nb_rx_queue = lcore_conf[lcore].n_rx_queue;
if (nb_rx_queue >= MAX_RX_QUEUE_PER_LCORE) {//如果接收队列总数大于128
printf("error: too many queues (%u) for lcore: %u\n",
(unsigned)nb_rx_queue + , (unsigned)lcore);
return -;
} else {
lcore_conf[lcore].rx_queue_list[nb_rx_queue].port_id =
lcore_params[i].port_id; //记录port_id
lcore_conf[lcore].rx_queue_list[nb_rx_queue].queue_id =
lcore_params[i].queue_id; //记录queue_id
lcore_conf[lcore].n_rx_queue++;//lcore上接收队列的数量加1
}
}
return ;
} /* display usage */
static void
print_usage(const char *prgname)
{
	/*
	 * Print the command-line synopsis to stdout; prgname is shown as the
	 * program name. The --config example now closes its second tuple.
	 */
	printf ("%s [EAL options] -- -p PORTMASK -P"
		" [--config (port,queue,lcore)[,(port,queue,lcore)]]"
		" [--enable-jumbo [--max-pkt-len PKTLEN]]\n"
		" -p PORTMASK: hexadecimal bitmask of ports to configure\n"
		" -P : enable promiscuous mode\n"
		" --config (port,queue,lcore): rx queues configuration\n"
		" --no-numa: optional, disable numa awareness\n"
		" --ipv6: optional, specify it if running ipv6 packets\n"
		" --enable-jumbo: enable jumbo frame"
		" which max packet len is PKTLEN in decimal (64-9600)\n"
		" --hash-entry-num: specify the hash entry number in hexadecimal to be setup\n",
		prgname);
}

static int /* parse the --max-pkt-len value */
parse_max_pkt_len(const char *pktlen)
{
	/*
	 * Parse a decimal packet length.
	 * Returns the value, or -1 when the string is empty, has trailing
	 * characters, is 0, or does not fit the int return type.
	 */
	char *end = NULL;
	unsigned long len;

	/* parse decimal string */
	len = strtoul(pktlen, &end, 10);
	if ((pktlen[0] == '\0') || (end == NULL) || (*end != '\0'))
		return -1;

	/* also refuse values the int return would silently truncate */
	if (len == 0 || len > (unsigned long)INT32_MAX)
		return -1;

	return (int)len;
}

static int /* parse the -p port mask */
parse_portmask(const char *portmask)
{
char *end = NULL;
unsigned long pm; /* parse hexadecimal string */
pm = strtoul(portmask, &end, );//字符串转换为十六进制的数字
if ((portmask[] == '\0') || (end == NULL) || (*end != '\0'))
return -; if (pm == )
return -; return pm;
} #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
static int
parse_hash_entry_number(const char *hash_entry_num)
{
char *end = NULL;
unsigned long hash_en;
/* parse hexadecimal string */
hash_en = strtoul(hash_entry_num, &end, );
if ((hash_entry_num[] == '\0') || (end == NULL) || (*end != '\0'))
return -; if (hash_en == )
return -; return hash_en;
}
#endif static int //分析参数中的配置
parse_config(const char *q_arg)
{
	/*
	 * Parse a list of "(port,queue,lcore)" tuples, for example
	 *   --config="(0,0,1),(0,1,2),(1,0,1),(1,1,3)"
	 * into lcore_params_array[] and make lcore_params point at it.
	 * Returns 0 on success and -1 on a malformed tuple, a field above
	 * 255, or more than MAX_LCORE_PARAMS tuples.
	 */
	char s[256];
	const char *p, *p0 = q_arg;
	char *end;
	enum fieldnames {
		FLD_PORT = 0,
		FLD_QUEUE,
		FLD_LCORE,
		_NUM_FLD
	};
	unsigned long int_fld[_NUM_FLD];
	char *str_fld[_NUM_FLD];
	int i;
	unsigned size;

	nb_lcore_params = 0;

	while ((p = strchr(p0,'(')) != NULL) {
		++p;
		if((p0 = strchr(p,')')) == NULL)
			return -1;

		/* text between the parentheses */
		size = p0 - p;
		if(size >= sizeof(s))
			return -1;

		/* the precision of "%.*s" must be an int */
		snprintf(s, sizeof(s), "%.*s", (int)size, p);
		if (rte_strsplit(s, sizeof(s), str_fld, _NUM_FLD, ',') != _NUM_FLD)
			return -1;
		for (i = 0; i < _NUM_FLD; i++){
			errno = 0;
			/* base 0: decimal, octal (0...) or hex (0x...) */
			int_fld[i] = strtoul(str_fld[i], &end, 0);
			/* every field is stored in a uint8_t below */
			if (errno != 0 || end == str_fld[i] || int_fld[i] > 255)
				return -1;
		}
		if (nb_lcore_params >= MAX_LCORE_PARAMS) {
			printf("exceeded max number of lcore params: %hu\n",
				nb_lcore_params);
			return -1;
		}
		lcore_params_array[nb_lcore_params].port_id = (uint8_t)int_fld[FLD_PORT];
		lcore_params_array[nb_lcore_params].queue_id = (uint8_t)int_fld[FLD_QUEUE];
		lcore_params_array[nb_lcore_params].lcore_id = (uint8_t)int_fld[FLD_LCORE];
		++nb_lcore_params;
	}
	/* from now on use the parsed table instead of the previous one */
	lcore_params = lcore_params_array;
	return 0;
}

#define CMD_LINE_OPT_CONFIG "config"
#define CMD_LINE_OPT_NO_NUMA "no-numa"
#define CMD_LINE_OPT_IPV6 "ipv6"
#define CMD_LINE_OPT_ENABLE_JUMBO "enable-jumbo"
#define CMD_LINE_OPT_HASH_ENTRY_NUM "hash-entry-num" /* Parse the argument given in the command line of the application */
static int //分析l3fwd相关的参数
parse_args(int argc, char **argv)
{
int opt, ret;
char **argvopt;
int option_index;
char *prgname = argv[];
static struct option lgopts[] = {
{CMD_LINE_OPT_CONFIG, , , }, //config参数对应于case 0
{CMD_LINE_OPT_NO_NUMA, , , },
{CMD_LINE_OPT_IPV6, , , },
{CMD_LINE_OPT_ENABLE_JUMBO, , , },
{CMD_LINE_OPT_HASH_ENTRY_NUM, , , },
{NULL, , , }//应该可以在这个地方加上kni_config命令字 }; argvopt = argv; while ((opt = getopt_long(argc, argvopt, "p:P",
lgopts, &option_index)) != EOF) { switch (opt) {
/* portmask 物理端口的掩码*/
case 'p':
enabled_port_mask = parse_portmask(optarg);//optarg为指向当前选项参数的指针
if (enabled_port_mask == ) {
printf("invalid portmask\n");
print_usage(prgname);
return -;
}
break;
case 'P': //混杂模式
printf("Promiscuous mode selected\n");
promiscuous_on = ;
break; /* long options 解析长选项 */
case :
if (!strncmp(lgopts[option_index].name, CMD_LINE_OPT_CONFIG,
sizeof (CMD_LINE_OPT_CONFIG))) { //参数config
ret = parse_config(optarg);//解析()中的参数
if (ret) {
printf("invalid config\n");
print_usage(prgname);
return -;
}
} if (!strncmp(lgopts[option_index].name, CMD_LINE_OPT_NO_NUMA,
sizeof(CMD_LINE_OPT_NO_NUMA))) { //参数no-numa
printf("numa is disabled \n");
numa_on = ;
} #if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
if (!strncmp(lgopts[option_index].name, CMD_LINE_OPT_IPV6,
sizeof(CMD_LINE_OPT_IPV6))) { //参数ipv6
printf("ipv6 is specified \n");
ipv6 = ;
}
#endif if (!strncmp(lgopts[option_index].name, CMD_LINE_OPT_ENABLE_JUMBO,
sizeof (CMD_LINE_OPT_ENABLE_JUMBO))) {//参数enable-jumbo
struct option lenopts = {"max-pkt-len", required_argument, , }; printf("jumbo frame is enabled - disabling simple TX path\n");
port_conf.rxmode.jumbo_frame = ; /* if no max-pkt-len set, use the default value ETHER_MAX_LEN */
if ( == getopt_long(argc, argvopt, "", &lenopts, &option_index)) {
ret = parse_max_pkt_len(optarg); //分析数据包的长度
if ((ret < ) || (ret > MAX_JUMBO_PKT_LEN)){
printf("invalid packet length\n");
print_usage(prgname);
return -;
}
port_conf.rxmode.max_rx_pkt_len = ret;
}
printf("set jumbo frame max packet length to %u\n",
(unsigned int)port_conf.rxmode.max_rx_pkt_len);
}
#if (APP_LOOKUP_METHOD == APP_LOOKUP_EXACT_MATCH)
if (!strncmp(lgopts[option_index].name, CMD_LINE_OPT_HASH_ENTRY_NUM,
sizeof(CMD_LINE_OPT_HASH_ENTRY_NUM))) {//参数hash-entry-num
ret = parse_hash_entry_number(optarg);
if ((ret > ) && (ret <= L3FWD_HASH_ENTRIES)) {
hash_entry_number = ret;
} else {
printf("invalid hash entry number\n");
print_usage(prgname);
return -;
}
}
#endif
break; default:
print_usage(prgname);
return -;
}
} if (optind >= )
argv[optind-] = prgname; ret = optind-;
optind = ; /* optind是下一个选项的索引 reset getopt lib */
return ret;
} static void //打印mac地址
print_ethaddr(const char *name, const struct ether_addr *eth_addr)
{
char buf[ETHER_ADDR_FMT_SIZE];
ether_format_addr(buf, ETHER_ADDR_FMT_SIZE, eth_addr);
printf("%s%s", name, buf);
} #if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM)
static void //创建LPM
setup_lpm(int socketid)
{
struct rte_lpm6_config config;
unsigned i;
int ret;
char s[]; /* 创建LPM ipv4表 create the LPM table */
snprintf(s, sizeof(s), "IPV4_L3FWD_LPM_%d", socketid);
ipv4_l3fwd_lookup_struct[socketid] = rte_lpm_create(s, socketid,
IPV4_L3FWD_LPM_MAX_RULES, );
if (ipv4_l3fwd_lookup_struct[socketid] == NULL)
rte_exit(EXIT_FAILURE, "Unable to create the l3fwd LPM table"
" on socket %d\n", socketid); /* 填充ipv4 LPM表 populate the LPM table */ for (i = ; i < IPV4_L3FWD_NUM_ROUTES; i++) {//遍历已经配置的所有的规则 /* skip unused ports 跳过未使用的物理端口*/
if (( << ipv4_l3fwd_route_array[i].if_out &
enabled_port_mask) == )
continue; //添加一条路由,即把规则转换为tbl24或者tbl8
ret = rte_lpm_add(ipv4_l3fwd_lookup_struct[socketid],
ipv4_l3fwd_route_array[i].ip,
ipv4_l3fwd_route_array[i].depth,
ipv4_l3fwd_route_array[i].if_out); if (ret < ) { //如果添加路由失败
rte_exit(EXIT_FAILURE, "Unable to add entry %u to the "
"l3fwd LPM table on socket %d\n",
i, socketid);
} printf("LPM: Adding route 0x%08x / %d (%d)\n",
(unsigned)ipv4_l3fwd_route_array[i].ip,
ipv4_l3fwd_route_array[i].depth,
ipv4_l3fwd_route_array[i].if_out);
} /* 创建lpm ipv6表 create the LPM6 table */
snprintf(s, sizeof(s), "IPV6_L3FWD_LPM_%d", socketid); config.max_rules = IPV6_L3FWD_LPM_MAX_RULES;
config.number_tbl8s = IPV6_L3FWD_LPM_NUMBER_TBL8S;
config.flags = ;
ipv6_l3fwd_lookup_struct[socketid] = rte_lpm6_create(s, socketid,
&config);
if (ipv6_l3fwd_lookup_struct[socketid] == NULL)
rte_exit(EXIT_FAILURE, "Unable to create the l3fwd LPM table"
" on socket %d\n", socketid); /* 填充LPM ipv6表 populate the LPM table */
for (i = ; i < IPV6_L3FWD_NUM_ROUTES; i++) { /* skip unused ports */
if (( << ipv6_l3fwd_route_array[i].if_out &
enabled_port_mask) == )
continue; ret = rte_lpm6_add(ipv6_l3fwd_lookup_struct[socketid],
ipv6_l3fwd_route_array[i].ip,
ipv6_l3fwd_route_array[i].depth,
ipv6_l3fwd_route_array[i].if_out); if (ret < ) {
rte_exit(EXIT_FAILURE, "Unable to add entry %u to the "
"l3fwd LPM table on socket %d\n",
i, socketid);
} printf("LPM: Adding route %s / %d (%d)\n",
"IPV6",
ipv6_l3fwd_route_array[i].depth,
ipv6_l3fwd_route_array[i].if_out);
}
}
#endif static int //初始化内存
init_mem(unsigned nb_mbuf)
{
struct lcore_conf *qconf;
int socketid;
unsigned lcore_id;
char s[]; for (lcore_id = ; lcore_id < RTE_MAX_LCORE; lcore_id++) {//遍历所有lcores
if (rte_lcore_is_enabled(lcore_id) == )
continue; if (numa_on) //一般开启了numa
socketid = rte_lcore_to_socket_id(lcore_id);//得到lcore所在的socketid
else
socketid = ; //默认socketid为0 if (socketid >= NB_SOCKETS) {
rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n",
socketid, lcore_id, NB_SOCKETS);
}
if (pktmbuf_pool[socketid] == NULL) {
snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
pktmbuf_pool[socketid] = //为每一个socket创建mempool用来动态分配mbufs
rte_mempool_create(s, nb_mbuf, MBUF_SIZE, MEMPOOL_CACHE_SIZE,
sizeof(struct rte_pktmbuf_pool_private),
rte_pktmbuf_pool_init, NULL,
rte_pktmbuf_init, NULL,
socketid, );
if (pktmbuf_pool[socketid] == NULL)
rte_exit(EXIT_FAILURE,
"Cannot init mbuf pool on socket %d\n", socketid);
else
printf("Allocated mbuf pool on socket %d\n", socketid); #if (APP_LOOKUP_METHOD == APP_LOOKUP_LPM)
setup_lpm(socketid); //创建LPM表,只需给每个socket cpu创建一个LPM表,而同一个CPU上的lcores共享LPM
#else
setup_hash(socketid); //创建Hash表
#endif
}
qconf = &lcore_conf[lcore_id];
qconf->ipv4_lookup_struct = ipv4_l3fwd_lookup_struct[socketid];
qconf->ipv6_lookup_struct = ipv6_l3fwd_lookup_struct[socketid];
}
return ;
} /* Check the link status of all ports in up to 9s, and print them finally */
static void //检查物理端口的连接状态
check_all_ports_link_status(uint8_t port_num, uint32_t port_mask)
{
#define CHECK_INTERVAL 100 /* 100ms */
#define MAX_CHECK_TIME 90 /* 9s (90 * 100ms) in total */
uint8_t portid, count, all_ports_up, print_flag = ;
struct rte_eth_link link; printf("\nChecking link status");
fflush(stdout);
for (count = ; count <= MAX_CHECK_TIME; count++) {//最多执行9000次
all_ports_up = ;
for (portid = ; portid < port_num; portid++) {//遍历物理端口
if ((port_mask & ( << portid)) == )
continue;
memset(&link, , sizeof(link));
rte_eth_link_get_nowait(portid, &link);
/* print link status if flag set */
if (print_flag == ) {
if (link.link_status)
printf("Port %d Link Up - speed %u "
"Mbps - %s\n", (uint8_t)portid,
(unsigned)link.link_speed,
(link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
("full-duplex") : ("half-duplex\n"));
else
printf("Port %d Link Down\n",
(uint8_t)portid);
continue;
}
/* clear all_ports_up flag if any link down */
if (link.link_status == ) {
all_ports_up = ;
break;
}
}
/* after finally printing all link status, get out */
if (print_flag == )
break; if (all_ports_up == ) {
printf(".");
fflush(stdout);
rte_delay_ms(CHECK_INTERVAL);
} /* set the print_flag if all ports up or timeout */
if (all_ports_up == || count == (MAX_CHECK_TIME - )) {
print_flag = ;
printf("done\n");
}
}
} int //主函数
main(int argc, char **argv)
{
struct lcore_conf *qconf;
struct rte_eth_dev_info dev_info;
struct rte_eth_txconf *txconf;
int ret;
unsigned nb_ports;
uint16_t queueid;
unsigned lcore_id;
uint32_t n_tx_queue, nb_lcores;
uint8_t portid, nb_rx_queue, queue, socketid; /* init EAL */
ret = rte_eal_init(argc, argv); //初始化软件抽象层,并解析EAL有关参数
if (ret < )
rte_exit(EXIT_FAILURE, "Invalid EAL parameters\n");
argc -= ret; //减少参数个数
argv += ret; //移动参数位置 /* parse application arguments (after the EAL ones) */
ret = parse_args(argc, argv); //解析l3fwd有关参数: -p -P --config
if (ret < )
rte_exit(EXIT_FAILURE, "Invalid L3FWD parameters\n"); if (check_lcore_params() < ) //检查lcore参数
rte_exit(EXIT_FAILURE, "check_lcore_params failed\n"); ret = init_lcore_rx_queues(); //初始化每个lcore上的rx queue数量
if (ret < )
rte_exit(EXIT_FAILURE, "init_lcore_rx_queues failed\n"); nb_ports = rte_eth_dev_count(); //获取物理端口的个数
if (nb_ports > RTE_MAX_ETHPORTS) //如果超过32个
nb_ports = RTE_MAX_ETHPORTS; if (check_port_config(nb_ports) < ) //检查物理端口的配置
rte_exit(EXIT_FAILURE, "check_port_config failed\n"); nb_lcores = rte_lcore_count(); //获取启用的lcores的总个数 /* initialize all ports 初始化所有的物理端口 */
for (portid = ; portid < nb_ports; portid++) { //遍历所有的物理端口
/* skip ports that are not enabled 跳过没有启用的物理端口 */
if ((enabled_port_mask & ( << portid)) == ) {
printf("\nSkipping disabled port %d\n", portid);
continue;
} /* init port 初始化物理端口*/
printf("Initializing port %d ... ", portid );
fflush(stdout); //清空标准输出(屏幕)的缓冲区,这样就能立即在屏幕上看到打印信息 nb_rx_queue = get_port_n_rx_queues(portid); //获取portid上的接收队列的个数
n_tx_queue = nb_lcores; //设定portid上的发送队列的个数为启用的lcores的个数
if (n_tx_queue > MAX_TX_QUEUE_PER_PORT) //如果发送队列的数量超过16个
n_tx_queue = MAX_TX_QUEUE_PER_PORT;
printf("Creating queues: nb_rxq=%d nb_txq=%u... ",
nb_rx_queue, (unsigned)n_tx_queue ); //这里是不是有点粗暴啊?????
ret = rte_eth_dev_configure(portid, nb_rx_queue, //第一步,配置网络设备
(uint16_t)n_tx_queue, &port_conf);
if (ret < ) //如果配置设备失败
rte_exit(EXIT_FAILURE, "Cannot configure device: err=%d, port=%d\n",
ret, portid); rte_eth_macaddr_get(portid, &ports_eth_addr[portid]); //记录mac地址到ports_eth_addr[portid]
print_ethaddr(" Address:", &ports_eth_addr[portid]);
printf(", "); /* 为每一个物理端口准备着源mac地址和目的mac地址
* prepare dst and src MACs for each port.
*/
*(uint64_t *)(val_eth + portid) =
ETHER_LOCAL_ADMIN_ADDR + ((uint64_t)portid << );
ether_addr_copy(&ports_eth_addr[portid], //前一个参数为from,后一个为to
(struct ether_addr *)(val_eth + portid) + );
/* init memory 分配内存并创建LPM或者hash */
ret = init_mem(NB_MBUF); //mempool包含8192个元素
if (ret < )
rte_exit(EXIT_FAILURE, "init_mem failed\n"); /*初始化一个发送队列成一对(lcore, port) init one TX queue per couple (lcore,port) */
queueid = ;
for (lcore_id = ; lcore_id < RTE_MAX_LCORE; lcore_id++) { //遍历一个物理接口上的所有的lcores
if (rte_lcore_is_enabled(lcore_id) == ) //忽略未启用的lcore
continue; if (numa_on)//如果启用numa
socketid = (uint8_t)rte_lcore_to_socket_id(lcore_id); //获取lcore_id所在的socketid
else
socketid = ;//默认socketid为0 printf("txq=%u,%d,%d ", lcore_id, queueid, socketid);
fflush(stdout);//清空标准输出(屏幕)的缓冲区 rte_eth_dev_info_get(portid, &dev_info);//获取设备信息
txconf = &dev_info.default_txconf;//得到发送的配置结构体指针
if (port_conf.rxmode.jumbo_frame)
txconf->txq_flags = ;
ret = rte_eth_tx_queue_setup(portid, queueid, nb_txd, //第二步,建立发送队列
socketid, txconf); //一个port上可能有多个queue,每个queue用一个lcore来绑定
if (ret < )
rte_exit(EXIT_FAILURE, "rte_eth_tx_queue_setup: err=%d, "
"port=%d\n", ret, portid); qconf = &lcore_conf[lcore_id]; //得到lcore_id的配置结构体指针
qconf->tx_queue_id[portid] = queueid; //记录发送队列的编号到lcore_conf中
queueid++; //发送队列的编号自增
} //end of for(lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
printf("\n");
} //end of for(portid = 0; portid < nb_ports; portid++) for (lcore_id = ; lcore_id < RTE_MAX_LCORE; lcore_id++) { //遍历所有的lcores
if (rte_lcore_is_enabled(lcore_id) == )
continue; //忽略未启用的lcore
qconf = &lcore_conf[lcore_id];
printf("\nInitializing rx queues on lcore %u ... ", lcore_id );
fflush(stdout);
/* init RX queues 初始化接收队列 */
for(queue = ; queue < qconf->n_rx_queue; ++queue) { //遍历所有的接收队列
portid = qconf->rx_queue_list[queue].port_id; //物理端口的编号
queueid = qconf->rx_queue_list[queue].queue_id;//接收队列的编号 if (numa_on)//一般启用numa
socketid = (uint8_t)rte_lcore_to_socket_id(lcore_id);//获取lcore_id所在的socketid
else
socketid = ;//默认socketid为0 printf("rxq=%d,%d,%d ", portid, queueid, socketid);
fflush(stdout);//清空标准输出(屏幕)的缓冲区 ret = rte_eth_rx_queue_setup(portid, queueid, nb_rxd, //第三步,建立接收队列
socketid, //一个port上可能有多个queue,每个queue用一个lcore来绑定
NULL,
pktmbuf_pool[socketid]);
if (ret < )
rte_exit(EXIT_FAILURE, "rte_eth_rx_queue_setup: err=%d,"
"port=%d\n", ret, portid);
} //for(queue = 0; queue < qconf->n_rx_queue; ++queue)
}//for(lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) printf("\n"); /* start ports 启动物理端口 */
for (portid = ; portid < nb_ports; portid++) { //遍历所有的物理端口
if ((enabled_port_mask & ( << portid)) == ) {
continue; //忽略未启用的物理端口
}
/* Start device 启动设备 */
ret = rte_eth_dev_start(portid); //第四步,启动物理端口
if (ret < )
rte_exit(EXIT_FAILURE, "rte_eth_dev_start: err=%d, port=%d\n",
ret, portid); /*
* If enabled, put device in promiscuous mode.
* This allows IO forwarding mode to forward packets
* to itself through 2 cross-connected ports of the
* target machine.
*/
if (promiscuous_on) //如果开始混杂模式
rte_eth_promiscuous_enable(portid); //启动混杂模式
}//end of for (portid = 0; portid < nb_ports; portid++) check_all_ports_link_status((uint8_t)nb_ports, enabled_port_mask); /* launch per-lcore init on every lcore 在每一个lcore上至多启动一个线程 */
rte_eal_mp_remote_launch(main_loop, NULL, CALL_MASTER);//CALL_MASTER表示在master也会启动线程
RTE_LCORE_FOREACH_SLAVE(lcore_id) { //遍历每个slave lcore
if (rte_eal_wait_lcore(lcore_id) < ) //等待线程结束
return -;
} return ;
}