|
|
|
|
# 46 | 发送网络包(下):如何表达我们想让合作伙伴做什么?
|
|
|
|
|
|
|
|
|
|
上一节我们讲网络包的发送,讲了上半部分,也即从VFS层一直到IP层,这一节我们接着看下去,看IP层和MAC层是如何发送数据的。
|
|
|
|
|
|
|
|
|
|
## 解析ip\_queue\_xmit函数
|
|
|
|
|
|
|
|
|
|
从ip\_queue\_xmit函数开始,我们就要进入IP层的发送逻辑了。
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
|
|
|
|
|
{
|
|
|
|
|
struct inet_sock *inet = inet_sk(sk);
|
|
|
|
|
struct net *net = sock_net(sk);
|
|
|
|
|
struct ip_options_rcu *inet_opt;
|
|
|
|
|
struct flowi4 *fl4;
|
|
|
|
|
struct rtable *rt;
|
|
|
|
|
struct iphdr *iph;
|
|
|
|
|
int res;
|
|
|
|
|
|
|
|
|
|
inet_opt = rcu_dereference(inet->inet_opt);
|
|
|
|
|
fl4 = &fl->u.ip4;
|
|
|
|
|
rt = skb_rtable(skb);
|
|
|
|
|
/* Make sure we can route this packet. */
|
|
|
|
|
rt = (struct rtable *)__sk_dst_check(sk, 0);
|
|
|
|
|
if (!rt) {
|
|
|
|
|
__be32 daddr;
|
|
|
|
|
/* Use correct destination address if we have options. */
|
|
|
|
|
daddr = inet->inet_daddr;
|
|
|
|
|
......
|
|
|
|
|
rt = ip_route_output_ports(net, fl4, sk,
|
|
|
|
|
daddr, inet->inet_saddr,
|
|
|
|
|
inet->inet_dport,
|
|
|
|
|
inet->inet_sport,
|
|
|
|
|
sk->sk_protocol,
|
|
|
|
|
RT_CONN_FLAGS(sk),
|
|
|
|
|
sk->sk_bound_dev_if);
|
|
|
|
|
if (IS_ERR(rt))
|
|
|
|
|
goto no_route;
|
|
|
|
|
sk_setup_caps(sk, &rt->dst);
|
|
|
|
|
}
|
|
|
|
|
skb_dst_set_noref(skb, &rt->dst);
|
|
|
|
|
|
|
|
|
|
packet_routed:
|
|
|
|
|
/* OK, we know where to send it, allocate and build IP header. */
|
|
|
|
|
skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
|
|
|
|
|
skb_reset_network_header(skb);
|
|
|
|
|
iph = ip_hdr(skb);
|
|
|
|
|
*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
|
|
|
|
|
if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
|
|
|
|
|
iph->frag_off = htons(IP_DF);
|
|
|
|
|
else
|
|
|
|
|
iph->frag_off = 0;
|
|
|
|
|
iph->ttl = ip_select_ttl(inet, &rt->dst);
|
|
|
|
|
iph->protocol = sk->sk_protocol;
|
|
|
|
|
ip_copy_addrs(iph, fl4);
|
|
|
|
|
|
|
|
|
|
/* Transport layer set skb->h.foo itself. */
|
|
|
|
|
|
|
|
|
|
if (inet_opt && inet_opt->opt.optlen) {
|
|
|
|
|
iph->ihl += inet_opt->opt.optlen >> 2;
|
|
|
|
|
ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ip_select_ident_segs(net, skb, sk,
|
|
|
|
|
skb_shinfo(skb)->gso_segs ?: 1);
|
|
|
|
|
|
|
|
|
|
/* TODO : should we use skb->sk here instead of sk ? */
|
|
|
|
|
skb->priority = sk->sk_priority;
|
|
|
|
|
skb->mark = sk->sk_mark;
|
|
|
|
|
|
|
|
|
|
res = ip_local_out(net, sk, skb);
|
|
|
|
|
......
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
在ip\_queue\_xmit中,也即IP层的发送函数里面,有三部分逻辑。
|
|
|
|
|
|
|
|
|
|
第一部分,选取路由,也即我要发送这个包应该从哪个网卡出去。
|
|
|
|
|
|
|
|
|
|
这件事情主要由ip\_route\_output\_ports函数完成。接下来的调用链为:ip\_route\_output\_ports->ip\_route\_output\_flow->\_\_ip\_route\_output\_key->ip\_route\_output\_key\_hash->ip\_route\_output\_key\_hash\_rcu。
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, struct fib_result *res, const struct sk_buff *skb)
|
|
|
|
|
{
|
|
|
|
|
struct net_device *dev_out = NULL;
|
|
|
|
|
int orig_oif = fl4->flowi4_oif;
|
|
|
|
|
unsigned int flags = 0;
|
|
|
|
|
struct rtable *rth;
|
|
|
|
|
......
|
|
|
|
|
err = fib_lookup(net, fl4, res, 0);
|
|
|
|
|
......
|
|
|
|
|
make_route:
|
|
|
|
|
rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
|
|
|
|
|
......
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
ip\_route\_output\_key\_hash\_rcu先会调用fib\_lookup。
|
|
|
|
|
|
|
|
|
|
**FIB**全称是Forwarding Information Base,即**转发信息表**,其实就是咱们常说的路由表。
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
static inline int fib_lookup(struct net *net, const struct flowi4 *flp, struct fib_result *res, unsigned int flags)
|
|
|
|
|
{ struct fib_table *tb;
|
|
|
|
|
......
|
|
|
|
|
tb = fib_get_table(net, RT_TABLE_MAIN);
|
|
|
|
|
if (tb)
|
|
|
|
|
err = fib_table_lookup(tb, flp, res, flags | FIB_LOOKUP_NOREF);
|
|
|
|
|
......
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
路由表可以有多个,一般会有一个主表,RT\_TABLE\_MAIN。然后fib\_table\_lookup函数在这个表里面进行查找。
|
|
|
|
|
|
|
|
|
|
路由表是一个什么样的结构呢?
|
|
|
|
|
|
|
|
|
|
路由就是在Linux服务器上的路由表里面配置的一条一条规则。这些规则大概是这样的:想访问某个网段,从某个网卡出去,下一跳是某个IP。
|
|
|
|
|
|
|
|
|
|
之前我们讲过一个简单的拓扑图,里面的三台Linux机器的路由表都可以通过ip route命令查看。
|
|
|
|
|
|
|
|
|
|
![](https://static001.geekbang.org/resource/image/f6/0e/f6982eb85dc66bd04200474efb3a050e.png)
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
# Linux服务器A
|
|
|
|
|
default via 192.168.1.1 dev eth0
|
|
|
|
|
192.168.1.0/24 dev eth0 proto kernel scope link src 192.168.1.100 metric 100
|
|
|
|
|
|
|
|
|
|
# Linux服务器B
|
|
|
|
|
default via 192.168.2.1 dev eth0
|
|
|
|
|
192.168.2.0/24 dev eth0 proto kernel scope link src 192.168.2.100 metric 100
|
|
|
|
|
|
|
|
|
|
# Linux服务器做路由器
|
|
|
|
|
192.168.1.0/24 dev eth0 proto kernel scope link src 192.168.1.1
|
|
|
|
|
192.168.2.0/24 dev eth1 proto kernel scope link src 192.168.2.1
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
其实,对于两端的服务器来讲,我们没有太多路由可以选,但是对于中间的Linux服务器做路由器来讲,这里有两条路可以选,一个是往左面转发,一个是往右面转发,就需要路由表的查找。
|
|
|
|
|
|
|
|
|
|
fib\_table\_lookup的代码逻辑比较复杂,好在注释比较清楚。因为路由表要按照前缀进行查询,希望找到最长匹配的那一个,例如192.168.2.0/24和192.168.0.0/16都能匹配192.168.2.100/24。但是,我们应该使用192.168.2.0/24的这一条。
|
|
|
|
|
|
|
|
|
|
为了更方便地做这件事情,我们使用了Trie树这种结构。比如我们有一系列的字符串:{bcs#, badge#, baby#, back#, badger#, badness#}。之所以每个字符串都加上#,是希望任何一个字符串都不会成为另外一个字符串的前缀。然后我们把它们放在Trie树中,如下图所示:
|
|
|
|
|
|
|
|
|
|
![](https://static001.geekbang.org/resource/image/3f/11/3f0a99cf1c47afcd0bd740c4b7802511.png)
|
|
|
|
|
|
|
|
|
|
对于将IP地址转成二进制放入trie树,也是同样的道理,可以很快进行路由的查询。
|
|
|
|
|
|
|
|
|
|
找到了路由,就知道了应该从哪个网卡发出去。
|
|
|
|
|
|
|
|
|
|
然后,ip\_route\_output\_key\_hash\_rcu会调用\_\_mkroute\_output,创建一个struct rtable,表示找到的路由表项。这个结构是由rt\_dst\_alloc函数分配的。
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
struct rtable *rt_dst_alloc(struct net_device *dev,
|
|
|
|
|
unsigned int flags, u16 type,
|
|
|
|
|
bool nopolicy, bool noxfrm, bool will_cache)
|
|
|
|
|
{
|
|
|
|
|
struct rtable *rt;
|
|
|
|
|
|
|
|
|
|
rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
|
|
|
|
|
(will_cache ? 0 : DST_HOST) |
|
|
|
|
|
(nopolicy ? DST_NOPOLICY : 0) |
|
|
|
|
|
(noxfrm ? DST_NOXFRM : 0));
|
|
|
|
|
|
|
|
|
|
if (rt) {
|
|
|
|
|
rt->rt_genid = rt_genid_ipv4(dev_net(dev));
|
|
|
|
|
rt->rt_flags = flags;
|
|
|
|
|
rt->rt_type = type;
|
|
|
|
|
rt->rt_is_input = 0;
|
|
|
|
|
rt->rt_iif = 0;
|
|
|
|
|
rt->rt_pmtu = 0;
|
|
|
|
|
rt->rt_gateway = 0;
|
|
|
|
|
rt->rt_uses_gateway = 0;
|
|
|
|
|
rt->rt_table_id = 0;
|
|
|
|
|
INIT_LIST_HEAD(&rt->rt_uncached);
|
|
|
|
|
|
|
|
|
|
rt->dst.output = ip_output;
|
|
|
|
|
if (flags & RTCF_LOCAL)
|
|
|
|
|
rt->dst.input = ip_local_deliver;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return rt;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
最终返回struct rtable实例,第一部分也就完成了。
|
|
|
|
|
|
|
|
|
|
第二部分,就是准备IP层的头,往里面填充内容。这就要对着IP层的头的格式进行理解。
|
|
|
|
|
|
|
|
|
|
![](https://static001.geekbang.org/resource/image/6b/2b/6b2ea7148a8e04138a2228c5dbc7182b.png)
|
|
|
|
|
|
|
|
|
|
在这里面,服务类型设置为tos,标识位里面设置是否允许分片frag\_off。如果不允许,而遇到MTU太小过不去的情况,就发送ICMP报错。TTL是这个包的存活时间,为了防止一个IP包迷路以后一直存活下去,每经过一个路由器TTL都减一,减为零则“死去”。设置protocol,指的是更上层的协议,这里是TCP。源地址和目标地址由ip\_copy\_addrs设置。最后,设置options。
|
|
|
|
|
|
|
|
|
|
第三部分,就是调用ip\_local\_out发送IP包。
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
|
|
|
|
|
{
|
|
|
|
|
int err;
|
|
|
|
|
|
|
|
|
|
err = __ip_local_out(net, sk, skb);
|
|
|
|
|
if (likely(err == 1))
|
|
|
|
|
err = dst_output(net, sk, skb);
|
|
|
|
|
|
|
|
|
|
return err;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
|
|
|
|
|
{
|
|
|
|
|
struct iphdr *iph = ip_hdr(skb);
|
|
|
|
|
iph->tot_len = htons(skb->len);
|
|
|
|
|
skb->protocol = htons(ETH_P_IP);
|
|
|
|
|
|
|
|
|
|
return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
|
|
|
|
|
net, sk, skb, NULL, skb_dst(skb)->dev,
|
|
|
|
|
dst_output);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
ip\_local\_out先是调用\_\_ip\_local\_out,然后里面调用了nf\_hook。这是什么呢?nf的意思是Netfilter,这是Linux内核的一个机制,用于在网络发送和转发的关键节点上加上hook函数,这些函数可以截获数据包,对数据包进行干预。
|
|
|
|
|
|
|
|
|
|
一个著名的实现,就是内核模块ip\_tables。在用户态,还有一个客户端程序iptables,用命令行来干预内核的规则。
|
|
|
|
|
|
|
|
|
|
![](https://static001.geekbang.org/resource/image/75/4d/75c8257049eed99499e802fcc2eacf4d.png)
|
|
|
|
|
|
|
|
|
|
iptables有表和链的概念,最重要的是两个表。
|
|
|
|
|
|
|
|
|
|
filter表处理过滤功能,主要包含以下三个链。
|
|
|
|
|
|
|
|
|
|
* INPUT链:过滤所有目标地址是本机的数据包
|
|
|
|
|
* FORWARD链:过滤所有路过本机的数据包
|
|
|
|
|
* OUTPUT链:过滤所有由本机产生的数据包
|
|
|
|
|
|
|
|
|
|
nat表主要处理网络地址转换,可以进行SNAT(改变源地址)、DNAT(改变目标地址),包含以下三个链。
|
|
|
|
|
|
|
|
|
|
* PREROUTING链:可以在数据包到达时改变目标地址
|
|
|
|
|
* OUTPUT链:可以改变本地产生的数据包的目标地址
|
|
|
|
|
* POSTROUTING链:在数据包离开时改变数据包的源地址
|
|
|
|
|
|
|
|
|
|
![](https://static001.geekbang.org/resource/image/76/da/765e5431fe4b17f62b1b5712cc82abda.png)
|
|
|
|
|
|
|
|
|
|
在这里,网络包马上就要发出去了,因而是NF\_INET\_LOCAL\_OUT,也即OUTPUT链,如果用户曾经在iptables里面写过某些规则,就会在nf\_hook这个函数里面起作用。
|
|
|
|
|
|
|
|
|
|
ip\_local\_out再调用dst\_output,就是真正的发送数据。
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
/* Output packet to network from transport. */
|
|
|
|
|
static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff *skb)
|
|
|
|
|
{
|
|
|
|
|
return skb_dst(skb)->output(net, sk, skb);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
这里调用的就是struct rtable成员dst的output函数。在rt\_dst\_alloc中,我们可以看到,output函数指向的是ip\_output。
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
|
|
|
|
|
{
|
|
|
|
|
struct net_device *dev = skb_dst(skb)->dev;
|
|
|
|
|
skb->dev = dev;
|
|
|
|
|
skb->protocol = htons(ETH_P_IP);
|
|
|
|
|
|
|
|
|
|
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
|
|
|
|
|
net, sk, skb, NULL, dev,
|
|
|
|
|
ip_finish_output,
|
|
|
|
|
!(IPCB(skb)->flags & IPSKB_REROUTED));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
在ip\_output里面,我们又看到了熟悉的NF\_HOOK。这一次是NF\_INET\_POST\_ROUTING,也即POSTROUTING链,处理完之后,调用ip\_finish\_output。
|
|
|
|
|
|
|
|
|
|
## 解析ip\_finish\_output函数
|
|
|
|
|
|
|
|
|
|
从ip\_finish\_output函数开始,发送网络包的逻辑由第三层到达第二层。ip\_finish\_output最终调用ip\_finish\_output2。
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
|
|
|
|
|
{
|
|
|
|
|
struct dst_entry *dst = skb_dst(skb);
|
|
|
|
|
struct rtable *rt = (struct rtable *)dst;
|
|
|
|
|
struct net_device *dev = dst->dev;
|
|
|
|
|
unsigned int hh_len = LL_RESERVED_SPACE(dev);
|
|
|
|
|
struct neighbour *neigh;
|
|
|
|
|
u32 nexthop;
|
|
|
|
|
......
|
|
|
|
|
nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr);
|
|
|
|
|
neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
|
|
|
|
|
if (unlikely(!neigh))
|
|
|
|
|
neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
|
|
|
|
|
if (!IS_ERR(neigh)) {
|
|
|
|
|
int res;
|
|
|
|
|
sock_confirm_neigh(skb, neigh);
|
|
|
|
|
res = neigh_output(neigh, skb);
|
|
|
|
|
return res;
|
|
|
|
|
}
|
|
|
|
|
......
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
在ip\_finish\_output2中,先找到struct rtable路由表里面的下一跳,下一跳一定和本机在同一个局域网中,可以通过二层进行通信,因而通过\_\_ipv4\_neigh\_lookup\_noref,查找如何通过二层访问下一跳。
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key)
|
|
|
|
|
{
|
|
|
|
|
return ___neigh_lookup_noref(&arp_tbl, neigh_key_eq32, arp_hashfn, &key, dev);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
\_\_ipv4\_neigh\_lookup\_noref是从本地的ARP表中查找下一跳的MAC地址。ARP表的定义如下:
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
struct neigh_table arp_tbl = {
|
|
|
|
|
.family = AF_INET,
|
|
|
|
|
.key_len = 4,
|
|
|
|
|
.protocol = cpu_to_be16(ETH_P_IP),
|
|
|
|
|
.hash = arp_hash,
|
|
|
|
|
.key_eq = arp_key_eq,
|
|
|
|
|
.constructor = arp_constructor,
|
|
|
|
|
.proxy_redo = parp_redo,
|
|
|
|
|
.id = "arp_cache",
|
|
|
|
|
......
|
|
|
|
|
.gc_interval = 30 * HZ,
|
|
|
|
|
.gc_thresh1 = 128,
|
|
|
|
|
.gc_thresh2 = 512,
|
|
|
|
|
.gc_thresh3 = 1024,
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
如果在ARP表中没有找到相应的项,则调用\_\_neigh\_create进行创建。
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool want_ref)
|
|
|
|
|
{
|
|
|
|
|
u32 hash_val;
|
|
|
|
|
int key_len = tbl->key_len;
|
|
|
|
|
int error;
|
|
|
|
|
struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev);
|
|
|
|
|
struct neigh_hash_table *nht;
|
|
|
|
|
|
|
|
|
|
memcpy(n->primary_key, pkey, key_len);
|
|
|
|
|
n->dev = dev;
|
|
|
|
|
dev_hold(dev);
|
|
|
|
|
|
|
|
|
|
/* Protocol specific setup. */
|
|
|
|
|
if (tbl->constructor && (error = tbl->constructor(n)) < 0) {
|
|
|
|
|
......
|
|
|
|
|
}
|
|
|
|
|
......
|
|
|
|
|
if (atomic_read(&tbl->entries) > (1 << nht->hash_shift))
|
|
|
|
|
nht = neigh_hash_grow(tbl, nht->hash_shift + 1);
|
|
|
|
|
|
|
|
|
|
hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift);
|
|
|
|
|
|
|
|
|
|
for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val],
|
|
|
|
|
lockdep_is_held(&tbl->lock));
|
|
|
|
|
n1 != NULL;
|
|
|
|
|
n1 = rcu_dereference_protected(n1->next,
|
|
|
|
|
lockdep_is_held(&tbl->lock))) {
|
|
|
|
|
if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) {
|
|
|
|
|
if (want_ref)
|
|
|
|
|
neigh_hold(n1);
|
|
|
|
|
rc = n1;
|
|
|
|
|
goto out_tbl_unlock;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
......
|
|
|
|
|
rcu_assign_pointer(n->next,
|
|
|
|
|
rcu_dereference_protected(nht->hash_buckets[hash_val],
|
|
|
|
|
lockdep_is_held(&tbl->lock)));
|
|
|
|
|
rcu_assign_pointer(nht->hash_buckets[hash_val], n);
|
|
|
|
|
......
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
\_\_neigh\_create先调用neigh\_alloc,创建一个struct neighbour结构,用于维护MAC地址和ARP相关的信息。这个名字也很好理解,大家都是在一个局域网里面,可以通过MAC地址访问到,当然是邻居了。
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev)
|
|
|
|
|
{
|
|
|
|
|
struct neighbour *n = NULL;
|
|
|
|
|
unsigned long now = jiffies;
|
|
|
|
|
int entries;
|
|
|
|
|
......
|
|
|
|
|
n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC);
|
|
|
|
|
if (!n)
|
|
|
|
|
goto out_entries;
|
|
|
|
|
|
|
|
|
|
__skb_queue_head_init(&n->arp_queue);
|
|
|
|
|
rwlock_init(&n->lock);
|
|
|
|
|
seqlock_init(&n->ha_lock);
|
|
|
|
|
n->updated = n->used = now;
|
|
|
|
|
n->nud_state = NUD_NONE;
|
|
|
|
|
n->output = neigh_blackhole;
|
|
|
|
|
seqlock_init(&n->hh.hh_lock);
|
|
|
|
|
n->parms = neigh_parms_clone(&tbl->parms);
|
|
|
|
|
setup_timer(&n->timer, neigh_timer_handler, (unsigned long)n);
|
|
|
|
|
|
|
|
|
|
NEIGH_CACHE_STAT_INC(tbl, allocs);
|
|
|
|
|
n->tbl = tbl;
|
|
|
|
|
refcount_set(&n->refcnt, 1);
|
|
|
|
|
n->dead = 1;
|
|
|
|
|
......
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
在neigh\_alloc中,我们先分配一个struct neighbour结构并且初始化。这里面比较重要的有两个成员,一个是arp\_queue,所有上层想通过ARP获取MAC地址的任务,都放在这个队列里面。另一个是timer定时器,我们设置成,过一段时间就调用neigh\_timer\_handler,来处理这些ARP任务。
|
|
|
|
|
|
|
|
|
|
\_\_neigh\_create然后调用了arp\_tbl的constructor函数,也即调用了arp\_constructor,在这里面定义了ARP的操作arp\_hh\_ops。
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
static int arp_constructor(struct neighbour *neigh)
|
|
|
|
|
{
|
|
|
|
|
__be32 addr = *(__be32 *)neigh->primary_key;
|
|
|
|
|
struct net_device *dev = neigh->dev;
|
|
|
|
|
struct in_device *in_dev;
|
|
|
|
|
struct neigh_parms *parms;
|
|
|
|
|
......
|
|
|
|
|
neigh->type = inet_addr_type_dev_table(dev_net(dev), dev, addr);
|
|
|
|
|
|
|
|
|
|
parms = in_dev->arp_parms;
|
|
|
|
|
__neigh_parms_put(neigh->parms);
|
|
|
|
|
neigh->parms = neigh_parms_clone(parms);
|
|
|
|
|
......
|
|
|
|
|
neigh->ops = &arp_hh_ops;
|
|
|
|
|
......
|
|
|
|
|
neigh->output = neigh->ops->output;
|
|
|
|
|
......
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static const struct neigh_ops arp_hh_ops = {
|
|
|
|
|
.family = AF_INET,
|
|
|
|
|
.solicit = arp_solicit,
|
|
|
|
|
.error_report = arp_error_report,
|
|
|
|
|
.output = neigh_resolve_output,
|
|
|
|
|
.connected_output = neigh_resolve_output,
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
\_\_neigh\_create最后是将创建的struct neighbour结构放入一个哈希表,从里面的代码逻辑比较容易看出,这是一个数组加链表的链式哈希表,先计算出哈希值hash\_val,得到相应的链表,然后循环这个链表找到对应的项,如果找不到就在链表头部插入一项。
|
|
|
|
|
|
|
|
|
|
我们回到ip\_finish\_output2,在\_\_neigh\_create之后,会调用neigh\_output发送网络包。
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
static inline int neigh_output(struct neighbour *n, struct sk_buff *skb)
|
|
|
|
|
{
|
|
|
|
|
......
|
|
|
|
|
return n->output(n, skb);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
按照上面对于struct neighbour的操作函数arp\_hh\_ops 的定义,output调用的是neigh\_resolve\_output。
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb)
|
|
|
|
|
{
|
|
|
|
|
if (!neigh_event_send(neigh, skb)) {
|
|
|
|
|
......
|
|
|
|
|
rc = dev_queue_xmit(skb);
|
|
|
|
|
}
|
|
|
|
|
......
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
在neigh\_resolve\_output里面,首先neigh\_event\_send触发一个事件,看能否激活ARP。
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
|
|
|
|
|
{
|
|
|
|
|
int rc;
|
|
|
|
|
bool immediate_probe = false;
|
|
|
|
|
|
|
|
|
|
if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) {
|
|
|
|
|
if (NEIGH_VAR(neigh->parms, MCAST_PROBES) +
|
|
|
|
|
NEIGH_VAR(neigh->parms, APP_PROBES)) {
|
|
|
|
|
unsigned long next, now = jiffies;
|
|
|
|
|
|
|
|
|
|
atomic_set(&neigh->probes,
|
|
|
|
|
NEIGH_VAR(neigh->parms, UCAST_PROBES));
|
|
|
|
|
neigh->nud_state = NUD_INCOMPLETE;
|
|
|
|
|
neigh->updated = now;
|
|
|
|
|
next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
|
|
|
|
|
HZ/2);
|
|
|
|
|
neigh_add_timer(neigh, next);
|
|
|
|
|
immediate_probe = true;
|
|
|
|
|
}
|
|
|
|
|
......
|
|
|
|
|
} else if (neigh->nud_state & NUD_STALE) {
|
|
|
|
|
neigh_dbg(2, "neigh %p is delayed\n", neigh);
|
|
|
|
|
neigh->nud_state = NUD_DELAY;
|
|
|
|
|
neigh->updated = jiffies;
|
|
|
|
|
neigh_add_timer(neigh, jiffies +
|
|
|
|
|
NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (neigh->nud_state == NUD_INCOMPLETE) {
|
|
|
|
|
if (skb) {
|
|
|
|
|
.......
|
|
|
|
|
__skb_queue_tail(&neigh->arp_queue, skb);
|
|
|
|
|
neigh->arp_queue_len_Bytes += skb->truesize;
|
|
|
|
|
}
|
|
|
|
|
rc = 1;
|
|
|
|
|
}
|
|
|
|
|
out_unlock_bh:
|
|
|
|
|
if (immediate_probe)
|
|
|
|
|
neigh_probe(neigh);
|
|
|
|
|
.......
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
在\_\_neigh\_event\_send中,激活ARP分两种情况,第一种情况是马上激活,也即immediate\_probe。另一种情况是延迟激活,这时仅仅设置一个timer。然后将ARP包放在arp\_queue上。如果马上激活,就直接调用neigh\_probe;如果延迟激活,则定时器到了就会触发neigh\_timer\_handler,在这里面还是会调用neigh\_probe。
|
|
|
|
|
|
|
|
|
|
我们就来看neigh\_probe的实现,在这里面会从arp\_queue中拿出ARP包来,然后调用struct neighbour的solicit操作。
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
static void neigh_probe(struct neighbour *neigh)
|
|
|
|
|
__releases(neigh->lock)
|
|
|
|
|
{
|
|
|
|
|
struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue);
|
|
|
|
|
......
|
|
|
|
|
if (neigh->ops->solicit)
|
|
|
|
|
neigh->ops->solicit(neigh, skb);
|
|
|
|
|
......
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
按照上面对于struct neighbour的操作函数arp\_hh\_ops 的定义,solicit调用的是arp\_solicit,在这里我们可以找到对于arp\_send\_dst的调用,创建并发送一个arp包,得到结果放在struct dst\_entry里面。
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
static void arp_send_dst(int type, int ptype, __be32 dest_ip,
|
|
|
|
|
struct net_device *dev, __be32 src_ip,
|
|
|
|
|
const unsigned char *dest_hw,
|
|
|
|
|
const unsigned char *src_hw,
|
|
|
|
|
const unsigned char *target_hw,
|
|
|
|
|
struct dst_entry *dst)
|
|
|
|
|
{
|
|
|
|
|
struct sk_buff *skb;
|
|
|
|
|
......
|
|
|
|
|
skb = arp_create(type, ptype, dest_ip, dev, src_ip,
|
|
|
|
|
dest_hw, src_hw, target_hw);
|
|
|
|
|
......
|
|
|
|
|
skb_dst_set(skb, dst_clone(dst));
|
|
|
|
|
arp_xmit(skb);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
我们回到neigh\_resolve\_output中,当ARP发送完毕,就可以调用dev\_queue\_xmit发送二层网络包了。
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
/**
|
|
|
|
|
* __dev_queue_xmit - transmit a buffer
|
|
|
|
|
* @skb: buffer to transmit
|
|
|
|
|
* @accel_priv: private data used for L2 forwarding offload
|
|
|
|
|
*
|
|
|
|
|
* Queue a buffer for transmission to a network device.
|
|
|
|
|
*/
|
|
|
|
|
static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
|
|
|
|
|
{
|
|
|
|
|
struct net_device *dev = skb->dev;
|
|
|
|
|
struct netdev_queue *txq;
|
|
|
|
|
struct Qdisc *q;
|
|
|
|
|
......
|
|
|
|
|
txq = netdev_pick_tx(dev, skb, accel_priv);
|
|
|
|
|
q = rcu_dereference_bh(txq->qdisc);
|
|
|
|
|
|
|
|
|
|
if (q->enqueue) {
|
|
|
|
|
rc = __dev_xmit_skb(skb, q, dev, txq);
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
......
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
就像咱们在讲述硬盘块设备的时候讲过,每个块设备都有队列,用于将内核的数据放到队列里面,然后设备驱动从队列里面取出后,将数据根据具体设备的特性发送给设备。
|
|
|
|
|
|
|
|
|
|
网络设备也是类似的,对于发送来说,有一个发送队列struct netdev\_queue \*txq。
|
|
|
|
|
|
|
|
|
|
这里还有另一个变量叫做struct Qdisc,这个是什么呢?如果我们在一台Linux机器上运行ip addr,我们能看到对于每一个网卡,都有下面的输出。
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
# ip addr
|
|
|
|
|
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
|
|
|
|
|
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
|
|
|
|
|
inet 127.0.0.1/8 scope host lo
|
|
|
|
|
valid_lft forever preferred_lft forever
|
|
|
|
|
inet6 ::1/128 scope host
|
|
|
|
|
valid_lft forever preferred_lft forever
|
|
|
|
|
2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1400 qdisc pfifo_fast state UP group default qlen 1000
|
|
|
|
|
link/ether fa:16:3e:75:99:08 brd ff:ff:ff:ff:ff:ff
|
|
|
|
|
inet 10.173.32.47/21 brd 10.173.39.255 scope global noprefixroute dynamic eth0
|
|
|
|
|
valid_lft 67104sec preferred_lft 67104sec
|
|
|
|
|
inet6 fe80::f816:3eff:fe75:9908/64 scope link
|
|
|
|
|
valid_lft forever preferred_lft forever
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
这里面有个关键字qdisc pfifo\_fast是什么意思呢?qdisc全称是queueing discipline,中文叫排队规则。内核如果需要通过某个网络接口发送数据包,都需要按照为这个接口配置的qdisc(排队规则)把数据包加入队列。
|
|
|
|
|
|
|
|
|
|
最简单的qdisc是pfifo,它不对进入的数据包做任何的处理,数据包采用先入先出的方式通过队列。pfifo\_fast稍微复杂一些,它的队列包括三个波段(band)。在每个波段里面,使用先进先出规则。
|
|
|
|
|
|
|
|
|
|
三个波段的优先级也不相同。band 0的优先级最高,band 2的最低。如果band 0里面有数据包,系统就不会处理band 1里面的数据包,band 1和band 2之间也是一样。
|
|
|
|
|
|
|
|
|
|
数据包是按照服务类型(Type of Service,TOS)被分配到三个波段里面的。TOS是IP头里面的一个字段,代表了当前的包是高优先级的,还是低优先级的。
|
|
|
|
|
|
|
|
|
|
pfifo\_fast分为三个先入先出的队列,我们称之为三个Band。根据网络包里面的TOS,看这个包到底应该进入哪个队列。TOS总共四位,每一位表示的意思不同,总共十六种类型。
|
|
|
|
|
|
|
|
|
|
![](https://static001.geekbang.org/resource/image/ab/d9/ab6af2f9e1a64868636080a05cfde0d9.png)
|
|
|
|
|
|
|
|
|
|
通过命令行tc qdisc show dev eth0,我们可以输出结果priomap,也是十六个数字。在0到2之间,和TOS的十六种类型对应起来。不同的TOS对应不同的队列。其中Band 0优先级最高,发送完毕后才轮到Band 1发送,最后才是Band 2。
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
# tc qdisc show dev eth0
|
|
|
|
|
qdisc pfifo_fast 0: root refcnt 2 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
接下来,\_\_dev\_xmit\_skb开始进行网络包发送。
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
|
|
|
|
|
struct net_device *dev,
|
|
|
|
|
struct netdev_queue *txq)
|
|
|
|
|
{
|
|
|
|
|
......
|
|
|
|
|
rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
|
|
|
|
|
if (qdisc_run_begin(q)) {
|
|
|
|
|
......
|
|
|
|
|
__qdisc_run(q);
|
|
|
|
|
}
|
|
|
|
|
......
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void __qdisc_run(struct Qdisc *q)
|
|
|
|
|
{
|
|
|
|
|
int quota = dev_tx_weight;
|
|
|
|
|
int packets;
|
|
|
|
|
while (qdisc_restart(q, &packets)) {
|
|
|
|
|
/*
|
|
|
|
|
* Ordered by possible occurrence: Postpone processing if
|
|
|
|
|
* 1. we've exceeded packet quota
|
|
|
|
|
* 2. another process needs the CPU;
|
|
|
|
|
*/
|
|
|
|
|
quota -= packets;
|
|
|
|
|
if (quota <= 0 || need_resched()) {
|
|
|
|
|
__netif_schedule(q);
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
qdisc_run_end(q);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
\_\_dev\_xmit\_skb会将请求放入队列,然后调用\_\_qdisc\_run处理队列中的数据。qdisc\_restart用于数据的发送。根据注释中的说法,qdisc的另一个功能是用于控制网络包的发送速度,因而如果超过速度,就需要重新调度,则会调用\_\_netif\_schedule。
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
static void __netif_reschedule(struct Qdisc *q)
|
|
|
|
|
{
|
|
|
|
|
struct softnet_data *sd;
|
|
|
|
|
unsigned long flags;
|
|
|
|
|
local_irq_save(flags);
|
|
|
|
|
sd = this_cpu_ptr(&softnet_data);
|
|
|
|
|
q->next_sched = NULL;
|
|
|
|
|
*sd->output_queue_tailp = q;
|
|
|
|
|
sd->output_queue_tailp = &q->next_sched;
|
|
|
|
|
raise_softirq_irqoff(NET_TX_SOFTIRQ);
|
|
|
|
|
local_irq_restore(flags);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
\_\_netif\_schedule会调用\_\_netif\_reschedule,发起一个软中断NET\_TX\_SOFTIRQ。咱们讲设备驱动程序的时候讲过,设备驱动程序处理中断,分两个过程,一个是屏蔽中断的关键处理逻辑,一个是延迟处理逻辑。当时说工作队列是延迟处理逻辑的处理方案,软中断也是一种方案。
|
|
|
|
|
|
|
|
|
|
在系统初始化的时候,我们会定义软中断的处理函数。例如,NET\_TX\_SOFTIRQ的处理函数是net\_tx\_action,用于发送网络包。还有一个NET\_RX\_SOFTIRQ的处理函数是net\_rx\_action,用于接收网络包。接收网络包的过程咱们下一节解析。
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
open_softirq(NET_TX_SOFTIRQ, net_tx_action);
|
|
|
|
|
open_softirq(NET_RX_SOFTIRQ, net_rx_action);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
这里我们来解析一下net\_tx\_action。
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
static __latent_entropy void net_tx_action(struct softirq_action *h)
|
|
|
|
|
{
|
|
|
|
|
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
|
|
|
|
|
......
|
|
|
|
|
if (sd->output_queue) {
|
|
|
|
|
struct Qdisc *head;
|
|
|
|
|
|
|
|
|
|
local_irq_disable();
|
|
|
|
|
head = sd->output_queue;
|
|
|
|
|
sd->output_queue = NULL;
|
|
|
|
|
sd->output_queue_tailp = &sd->output_queue;
|
|
|
|
|
local_irq_enable();
|
|
|
|
|
|
|
|
|
|
while (head) {
|
|
|
|
|
struct Qdisc *q = head;
|
|
|
|
|
spinlock_t *root_lock;
|
|
|
|
|
|
|
|
|
|
head = head->next_sched;
|
|
|
|
|
......
|
|
|
|
|
qdisc_run(q);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
我们会发现,net\_tx\_action还是调用了qdisc\_run,还是会调用\_\_qdisc\_run,然后调用qdisc\_restart发送网络包。
|
|
|
|
|
|
|
|
|
|
我们来看一下qdisc\_restart的实现。
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
static inline int qdisc_restart(struct Qdisc *q, int *packets)
|
|
|
|
|
{
|
|
|
|
|
struct netdev_queue *txq;
|
|
|
|
|
struct net_device *dev;
|
|
|
|
|
spinlock_t *root_lock;
|
|
|
|
|
struct sk_buff *skb;
|
|
|
|
|
bool validate;
|
|
|
|
|
|
|
|
|
|
/* Dequeue packet */
|
|
|
|
|
skb = dequeue_skb(q, &validate, packets);
|
|
|
|
|
if (unlikely(!skb))
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
root_lock = qdisc_lock(q);
|
|
|
|
|
dev = qdisc_dev(q);
|
|
|
|
|
txq = skb_get_tx_queue(dev, skb);
|
|
|
|
|
|
|
|
|
|
return sch_direct_xmit(skb, q, dev, txq, root_lock, validate);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
qdisc\_restart将网络包从Qdisc的队列中拿下来,然后调用sch\_direct\_xmit进行发送。
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
|
|
|
|
|
struct net_device *dev, struct netdev_queue *txq,
|
|
|
|
|
spinlock_t *root_lock, bool validate)
|
|
|
|
|
{
|
|
|
|
|
int ret = NETDEV_TX_BUSY;
|
|
|
|
|
|
|
|
|
|
if (likely(skb)) {
|
|
|
|
|
if (!netif_xmit_frozen_or_stopped(txq))
|
|
|
|
|
skb = dev_hard_start_xmit(skb, dev, txq, &ret);
|
|
|
|
|
}
|
|
|
|
|
......
|
|
|
|
|
if (dev_xmit_complete(ret)) {
|
|
|
|
|
/* Driver sent out skb successfully or skb was consumed */
|
|
|
|
|
ret = qdisc_qlen(q);
|
|
|
|
|
} else {
|
|
|
|
|
/* Driver returned NETDEV_TX_BUSY - requeue skb */
|
|
|
|
|
ret = dev_requeue_skb(skb, q);
|
|
|
|
|
}
|
|
|
|
|
......
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
在sch\_direct\_xmit中,调用dev\_hard\_start\_xmit进行发送,如果发送不成功,会返回NETDEV\_TX\_BUSY。这说明网卡很忙,于是就调用dev\_requeue\_skb,重新放入队列。
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev, struct netdev_queue *txq, int *ret)
|
|
|
|
|
{
|
|
|
|
|
struct sk_buff *skb = first;
|
|
|
|
|
int rc = NETDEV_TX_OK;
|
|
|
|
|
|
|
|
|
|
while (skb) {
|
|
|
|
|
struct sk_buff *next = skb->next;
|
|
|
|
|
rc = xmit_one(skb, dev, txq, next != NULL);
|
|
|
|
|
skb = next;
|
|
|
|
|
if (netif_xmit_stopped(txq) && skb) {
|
|
|
|
|
rc = NETDEV_TX_BUSY;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
......
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
在dev\_hard\_start\_xmit中,是一个while循环。每次在队列中取出一个sk\_buff,调用xmit\_one发送。
|
|
|
|
|
|
|
|
|
|
接下来的调用链为:xmit\_one->netdev\_start\_xmit->\_\_netdev\_start\_xmit。
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, struct sk_buff *skb, struct net_device *dev, bool more)
|
|
|
|
|
{
|
|
|
|
|
skb->xmit_more = more ? 1 : 0;
|
|
|
|
|
return ops->ndo_start_xmit(skb, dev);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
这个时候,已经到了设备驱动层了。我们能看到,drivers/net/ethernet/intel/ixgb/ixgb\_main.c里面有对于这个网卡的操作的定义。
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
static const struct net_device_ops ixgb_netdev_ops = {
|
|
|
|
|
.ndo_open = ixgb_open,
|
|
|
|
|
.ndo_stop = ixgb_close,
|
|
|
|
|
.ndo_start_xmit = ixgb_xmit_frame,
|
|
|
|
|
.ndo_set_rx_mode = ixgb_set_multi,
|
|
|
|
|
.ndo_validate_addr = eth_validate_addr,
|
|
|
|
|
.ndo_set_mac_address = ixgb_set_mac,
|
|
|
|
|
.ndo_change_mtu = ixgb_change_mtu,
|
|
|
|
|
.ndo_tx_timeout = ixgb_tx_timeout,
|
|
|
|
|
.ndo_vlan_rx_add_vid = ixgb_vlan_rx_add_vid,
|
|
|
|
|
.ndo_vlan_rx_kill_vid = ixgb_vlan_rx_kill_vid,
|
|
|
|
|
.ndo_fix_features = ixgb_fix_features,
|
|
|
|
|
.ndo_set_features = ixgb_set_features,
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
在这里面,我们可以找到对于ndo\_start\_xmit的定义,调用ixgb\_xmit\_frame。
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
static netdev_tx_t
|
|
|
|
|
ixgb_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
|
|
|
|
|
{
|
|
|
|
|
struct ixgb_adapter *adapter = netdev_priv(netdev);
|
|
|
|
|
......
|
|
|
|
|
if (count) {
|
|
|
|
|
ixgb_tx_queue(adapter, count, vlan_id, tx_flags);
|
|
|
|
|
/* Make sure there is space in the ring for the next send. */
|
|
|
|
|
ixgb_maybe_stop_tx(netdev, &adapter->tx_ring, DESC_NEEDED);
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
......
|
|
|
|
|
return NETDEV_TX_OK;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
在ixgb\_xmit\_frame中,我们会得到这个网卡对应的适配器,然后将其放入硬件网卡的队列中。
|
|
|
|
|
|
|
|
|
|
至此,整个发送才算结束。
|
|
|
|
|
|
|
|
|
|
## 总结时刻
|
|
|
|
|
|
|
|
|
|
这一节,我们继续解析了发送一个网络包的过程,我把整个过程的图画在了下面。
|
|
|
|
|
|
|
|
|
|
![](https://static001.geekbang.org/resource/image/79/6f/79cc42f3163d159a66e163c006d9f36f.png)
|
|
|
|
|
|
|
|
|
|
这个过程分成几个层次。
|
|
|
|
|
|
|
|
|
|
* VFS层:write系统调用找到struct file,根据里面的file\_operations的定义,调用sock\_write\_iter函数。sock\_write\_iter函数调用sock\_sendmsg函数。
|
|
|
|
|
* Socket层:从struct file里面的private\_data得到struct socket,根据里面ops的定义,调用inet\_sendmsg函数。
|
|
|
|
|
* Sock层:从struct socket里面的sk得到struct sock,根据里面sk\_prot的定义,调用tcp\_sendmsg函数。
|
|
|
|
|
* TCP层:tcp\_sendmsg函数会调用tcp\_write\_xmit函数,tcp\_write\_xmit函数会调用tcp\_transmit\_skb,在这里实现了TCP层面向连接的逻辑。
|
|
|
|
|
* IP层:扩展struct sock,得到struct inet\_connection\_sock,根据里面icsk\_af\_ops的定义,调用ip\_queue\_xmit函数。
|
|
|
|
|
* IP层:ip\_route\_output\_ports函数里面会调用fib\_lookup查找路由表。FIB全称是Forwarding Information Base,转发信息表,也就是路由表。
|
|
|
|
|
* 在IP层里面要做的另一个事情是填写IP层的头。
|
|
|
|
|
* 在IP层还要做的一件事情就是通过iptables规则。
|
|
|
|
|
* MAC层:IP层调用ip\_finish\_output进入MAC层。
|
|
|
|
|
* MAC层需要ARP获得MAC地址,因而要调用\_\_\_neigh\_lookup\_noref查找属于同一个网段的邻居,它会调用neigh\_probe发送ARP。
|
|
|
|
|
* 有了MAC地址,就可以调用dev\_queue\_xmit发送二层网络包了,它会调用\_\_dev\_xmit\_skb将请求放入队列。
|
|
|
|
|
* 设备层:网络包的发送会触发一个软中断NET\_TX\_SOFTIRQ来处理队列中的数据。这个软中断的处理函数是net\_tx\_action。
|
|
|
|
|
* 在软中断处理函数中,会将网络包从队列上拿下来,调用网络设备的传输函数ixgb\_xmit\_frame,将网络包发到设备的队列上去。
|
|
|
|
|
|
|
|
|
|
## 课堂练习
|
|
|
|
|
|
|
|
|
|
上一节你应该通过tcpdump看到了TCP包头的格式,这一节,请你查看一下IP包的格式以及ARP的过程。
|
|
|
|
|
|
|
|
|
|
欢迎留言和我分享你的疑惑和见解,也欢迎你收藏本节内容,反复研读。你也可以把今天的内容分享给你的朋友,和他一起学习和进步。
|
|
|
|
|
|
|
|
|
|
![](https://static001.geekbang.org/resource/image/8c/37/8c0a95fa07a8b9a1abfd394479bdd637.jpg)
|
|
|
|
|
|