Bridge转发逻辑
本文主要介绍linux bridge的转发流程,以及bridge设计的几个hook点。首先看一张完整的转发图。
下面逐一看每个函数的分析
• netif_receive_skb:网卡接收函数
/net/core/dev.c
点击(此处)折叠或打开
- int netif_receive_skb(struct sk_buff *skb)
- {
- //…
- skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
- //…
- }
• handle_bridge:网桥处理函数
/net/core/dev.c
点击(此处)折叠或打开
- static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
- struct packet_type **pt_prev, int *ret,
- struct net_device *orig_dev)
- {
- struct net_bridge_port *port;
- if (skb->pkt_type == PACKET_LOOPBACK ||
- (port = rcu_dereference(skb->dev->br_port)) == NULL)// A注意这个判断
- return skb;
-
- if (*pt_prev) { //一般来说pt_prev为NULL
- *ret = deliver_skb(skb, *pt_prev, orig_dev);
- *pt_prev = NULL;
- }
- //调用bridge挂载的钩子函数,该函数在bridge模块装载时初始化
- return br_handle_frame_hook(port, skb);
- }
- br_handle_frame_hook函数在bridge模块装载时初始化
- /* net/bridge/br.c */
- static int __init br_init(void)
- {
- //......
- br_handle_frame_hook = br_handle_frame;
- //......
- }
• br_handle_frame
/net/bridge/br_input.c
点击(此处)折叠或打开
- struct sk_buff *br_handle_frame(struct net_bridge_port *p, struct sk_buff *skb)
- {
- const unsigned char *dest = eth_hdr(skb)->h_dest;
- int (*rhook)(struct sk_buff *skb);
- //…
- if (unlikely(is_link_local(dest))) {//如果是本地多播地址(形如:01:80:c2:00:00:0X)
- //…
- // 自身包进入PF_BRIDGE的INPUT点, 一般处理的包数不多
- if (NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_IN, skb, skb->dev,
- NULL, br_handle_local_finish))
- return NULL; /* frame consumed by filter */
- else
- return skb; /* continue processing */
- }
- //进入转发逻辑
- forward:
- switch (p->state) {
- case BR_STATE_FORWARDING:
- rhook = rcu_dereference(br_should_route_hook);
- if (rhook != NULL) {
- if (rhook(skb))
- return skb;
- dest = eth_hdr(skb)->h_dest;
- }
- /* fall through */
- case BR_STATE_LEARNING://如果数据包的目的mac为bridge的mac
- if (!compare_ether_addr(p->br->dev->dev_addr, dest))
- skb->pkt_type = PACKET_HOST;
- //bridge的PRE_ROUTING
- NF_HOOK(PF_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL,
- br_handle_frame_finish);
- break;
- default:
- drop:
- kfree_skb(skb);
- }
- return NULL;
- }
• br_handle_frame_finish:这个函数完成更新mac表、查找mac表确定出口dev。
/net/bridge/br_input.c
点击(此处)折叠或打开
- int br_handle_frame_finish(struct sk_buff *skb)
- {
- const unsigned char *dest = eth_hdr(skb)->h_dest;
- struct net_bridge_port *p = rcu_dereference(skb->dev->br_port);
- struct net_bridge *br;
- struct net_bridge_fdb_entry *dst;
- struct sk_buff *skb2;
- /* insert into forwarding database after filtering to avoid spoofing */
- br = p->br;
- br_fdb_update(br, p, eth_hdr(skb)->h_source);
- /* The packet skb2 goes to the local host (NULL to skip). */
- skb2 = NULL;
- if (br->dev->flags & IFF_PROMISC) //如果网桥设备被设置为混杂模式
- skb2 = skb;
- dst = NULL;
- if (is_multicast_ether_addr(dest)) {//如果是多播或广播(目的mac首字节的最低位为1)
- br->dev->stats.multicast++;
- skb2 = skb;
- } else if ((dst = __br_fdb_get(br, dest)) && dst->is_local) { //如果目的mac为本机mac
- skb2 = skb;
- /* Do not forward the packet since it's local. */
- skb = NULL; //skb2为要发往本机上层协议栈的,skb为要转发的
- }
-
- if (skb2 == skb)
- skb2 = skb_clone(skb, GFP_ATOMIC);
- if (skb2) //发往本机上层协议栈
- br_pass_frame_up(br, skb2);
- if (skb) { //转发
- if (dst)
- br_forward(dst->dst, skb);
- else
- br_flood_forward(br, skb);
- }
- out:
- return 0;
- drop:
- kfree_skb(skb);
- goto out;
- }
• br_pass_frame_up:发往本地
// net/bridge/br_input.c
点击(此处)折叠或打开
- static void br_pass_frame_up(struct net_bridge *br, struct sk_buff *skb)
- {
- struct net_device *indev, *brdev = br->dev;
- brdev->stats.rx_packets++;
- brdev->stats.rx_bytes += skb->len;
- indev = skb->dev;
- skb->dev = brdev;
- //bridge的LOCAL_IN
- NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_IN, skb, indev, NULL,
- netif_receive_skb);
- }
这段代码非常简单,对net_bridge的数据统计进行更新以后,再更新skb->dev,最后通过NF_HOOK在NF_BR_LOCAL_IN挂接点上调用回了netif_receive_skb。
前面已经提到,在netif_receive_skb函数中,调用了handle_bridge函数,并且触发了网桥的处理流程,现在发往网桥虚拟设备的数据包又回到了netif_receive_skb,那么网桥的处理过程会不会又被调用到呢?
在 linux/net/bridge/br_if.c里面可以看到br_add_if函数,实际上的操作是将某一网口(dev)加入网桥组,这个函数调用了new_nbp(br, dev); 用以填充net_bridge以及dev结构的重要成员,里面将dev->br_port(这里dev是加入bridge的dev而不是bridge自身对应的dev)设定为一个新建的net_bridge_port结构。而上面的br_pass_frame_up函数将skb->dev赋成了br->dev,实际上skb->dev变成了网桥建立的虚拟设备(bridge自身对应的dev),这个设备是网桥本身而不是桥组的某一端口,系统没有为其调用br_add_if,所以这个net_device结构的br_port指针没有进行赋值。
在handle_bridge中有这样的检查
if (skb->pkt_type == PACKET_LOOPBACK ||
(port = rcu_dereference(skb->dev->br_port)) == NULL)
return skb;
经过br_pass_frame_up 函数后,skb->dev->br_port为空,所以将直接返回skb而不进行网桥处理。
另外,我们看到,系统在NF_BR_LOCAL_IN挂接点上调用了netif_receive_skb,而netif_receive_skb还会调用ip_rcv函数,所以数据包在NF_IP_LOCAL_IN还可以被捕获到。
• br_forward:转发
// net/bridge/br_forward.c
点击(此处)折叠或打开
- void br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
- {
- //接口检查,确认端口处于BR_STATE_FORWARDING状态,网桥允许转发,并且转发的出口和入口的dev不相等
- if (should_deliver(to, skb)) {
- __br_forward(to, skb);
- return;
- }
- kfree_skb(skb);
- }
• __br_forward
// net/bridge/br_forward.c
点击(此处)折叠或打开
- static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
- {
- struct net_device *indev;
- indev = skb->dev;
- skb->dev = to->dev; //修改skb->dev为目的出口对应的dev
- skb_forward_csum(skb); //计算校验和
- //bridge的FORWARD
- NF_HOOK(PF_BRIDGE, NF_BR_FORWARD, skb, indev, skb->dev,
- br_forward_finish);
- }
• br_forward_finish
点击(此处)折叠或打开
- int br_forward_finish(struct sk_buff *skb)
- { //bridge的POST_ROUTING
- return NF_HOOK(PF_BRIDGE, NF_BR_POST_ROUTING, skb, NULL, skb->dev,
- br_dev_queue_push_xmit);
- }
• br_dev_queue_push_xmit
// net/bridge/br_forward.c
点击(此处)折叠或打开
- int br_dev_queue_push_xmit(struct sk_buff *skb)
- {
- /* drop mtu oversized packets except gso */
- if (packet_length(skb) > skb->dev->mtu && !skb_is_gso(skb))
- kfree_skb(skb);
- else {
- /* ip_refrag calls ip_fragment, doesn't copy the MAC header. */
- if (nf_bridge_maybe_copy_header(skb))
- kfree_skb(skb);
- else {
- skb_push(skb, ETH_HLEN);
- dev_queue_xmit(skb);
- }
- }
- return 0;
- }
在dev_queue_xmit()中会判断skb中的dev字段,根据这个字段指示的设备调用该设备的发送函数hard_start_xmit来对skb进行转发。其实到这里bridge的转发逻辑基本就完成了,但是如果目的dev依然是bridge呢?那就调用bridge的hard_start_xmit,而bridge的hard_start_xmit在bridge初始化时由br_dev_setup设置。
/* net/bridge/br_device.c */
点击(此处)折叠或打开
- void br_dev_setup(struct net_device *dev)
- {
- //......
- dev->hard_start_xmit = br_dev_xmit;
- //......
- }
• br_dev_xmit
/* net/bridge/br_device.c */
/* net device transmit always called with no BH (preempt_disabled) */
点击(此处)折叠或打开
- int br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
- {
- struct net_bridge *br = netdev_priv(dev);
- const unsigned char *dest = skb->data;
- struct net_bridge_fdb_entry *dst;
- br->statistics.tx_packets++;
- br->statistics.tx_bytes += skb->len;
- skb->mac.raw = skb->data;
- skb_pull(skb, ETH_HLEN);
- if (dest[0] & 1)
- // 多播/广播发送(目的mac首字节最低位为1),向所有端口泛洪
- br_flood_deliver(br, skb, 0);
- else if ((dst = __br_fdb_get(br, dest)) != NULL) //查转发表
- // 单播发送
- br_deliver(dst->dst, skb);
- else
- // 未知单播(转发表中查不到目的mac),向所有端口泛洪
- br_flood_deliver(br, skb, 0);
- // 这些发送函数最终都会调用__br_deliver()函数
- return 0;
- }
• __br_deliver
/* net/bridge/br_forward.c */
点击(此处)折叠或打开
- static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
- {
- skb->dev = to->dev; //设置为出口dev
- // 此处是PF_BRIDGE的OUTPUT点
- NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev,
- br_forward_finish);
- }
注意这里调用完成后又要调用 br_forward_finish,但这不是循环,因为在__br_deliver中skb->dev已经改变为出口端口对应的dev,下一轮调用的hard_start_xmit也会不同。最后再看一遍全局的转发图,应该就比较清晰了。