The Linux Kernel Packet Receive Path
net/core/dev.c
static int __init net_dev_init(void)
{
	...
	queue->backlog.poll = process_backlog;	// each CPU's backlog device uses process_backlog as its NAPI poll function
	...
	open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);	// NET_RX_SOFTIRQ drives the receive path
	...
}
net/core/dev.c
static int process_backlog(struct napi_struct *napi, int quota)
{
	int work = 0;
	struct softnet_data *queue = &__get_cpu_var(softnet_data);	// this CPU's softnet_data (per-CPU variable)
	unsigned long start_time = jiffies;
	napi->weight = weight_p;
	do {
		struct sk_buff *skb;	// the packet buffer
		struct net_device *dev;
		local_irq_disable();	// disable local interrupts
		skb = __skb_dequeue(&queue->input_pkt_queue);	// pull one skb off the per-CPU input queue (how it was enqueued is not covered here)
		if (!skb) {
			__napi_complete(napi);
			local_irq_enable();
			break;
		}
		local_irq_enable();	// re-enable local interrupts
		dev = skb->dev;
		netif_receive_skb(skb);	// hand the packet to the protocol layers (IP, for our purposes)
		dev_put(dev);	// release the reference so the device may be freed
	} while (++work < quota && jiffies == start_time);
	return work;
}
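Note the loop condition: process_backlog handles at most quota packets per call and also bails out as soon as the jiffy counter advances, so a flooded queue cannot monopolize the CPU inside the softirq. The following self-contained user-space sketch (not kernel code; a plain array and time() stand in for input_pkt_queue and jiffies) shows the same budget pattern:

/* User-space sketch of the budget pattern in process_backlog(): handle at
 * most `quota` items per call and stop early when the timer tick changes. */
#include <stdio.h>
#include <time.h>

#define NITEMS 8

static int queue[NITEMS] = { 1, 2, 3, 4, 5, 6, 7, 8 };
static int head;					/* index of the next item */

static int *pop(void)
{
	return head < NITEMS ? &queue[head++] : NULL;
}

static int drain_with_budget(int quota)
{
	int work = 0;
	time_t start = time(NULL);			/* snapshot, like start_time = jiffies */

	do {
		int *item = pop();
		if (!item)
			break;				/* queue empty, nothing left to do */
		printf("handled item %d\n", *item);	/* stands in for netif_receive_skb() */
	} while (++work < quota && time(NULL) == start);

	return work;					/* work == quota means "come back later" */
}

int main(void)
{
	printf("processed %d items this round\n", drain_with_budget(3));
	return 0;
}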
include/linux/skbuff.h
static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list)
{
	struct sk_buff *next, *prev, *result;
	prev = (struct sk_buff *) list;
	next = prev->next;
	result = NULL;
	if (next != prev) {
		result	     = next;
		next	     = next->next;
		list->qlen--;
		next->prev   = prev;
		prev->next   = next;
		result->next = result->prev = NULL;
	}
	return result;
}
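The cast on the first line works because sk_buff_head intentionally shares its first two members (next, prev) with sk_buff, so the head of the queue can be treated as just another node in a circular doubly-linked list; the queue is empty exactly when head->next points back at the head. A self-contained user-space mirror of the same unlink logic (illustrative only, not kernel code):

/* User-space mirror of __skb_dequeue(): the list head has the same first
 * two members as a node, so it can be cast to a node and treated as part
 * of the circular doubly-linked list. */
#include <stdio.h>

struct node {
	struct node *next, *prev;
	int value;
};

struct list_head {
	struct node *next, *prev;		/* same layout as a node's link fields */
	unsigned int qlen;
};

static void list_init(struct list_head *list)
{
	list->next = list->prev = (struct node *)list;
	list->qlen = 0;
}

static void enqueue_tail(struct list_head *list, struct node *n)
{
	struct node *head = (struct node *)list;

	n->next = head;
	n->prev = list->prev;
	list->prev->next = n;
	list->prev = n;
	list->qlen++;
}

static struct node *dequeue(struct list_head *list)
{
	struct node *prev = (struct node *)list;	/* the head doubles as a node */
	struct node *next = prev->next;
	struct node *result = NULL;

	if (next != prev) {				/* non-empty: unlink the first node */
		result = next;
		next = next->next;
		list->qlen--;
		next->prev = prev;
		prev->next = next;
		result->next = result->prev = NULL;
	}
	return result;
}

int main(void)
{
	struct list_head list;
	struct node a = { .value = 1 }, b = { .value = 2 };
	struct node *n;

	list_init(&list);
	enqueue_tail(&list, &a);
	enqueue_tail(&list, &b);
	while ((n = dequeue(&list)) != NULL)
		printf("dequeued %d, qlen now %u\n", n->value, list.qlen);
	return 0;
}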
net/core/dev.c
/*
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	netif_receive_skb() is the main receive data processing function.
 *	It always succeeds. The buffer may be dropped during processing
 *	for congestion control or by the protocol layers.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled. (note this requirement)
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	struct packet_type *ptype, *pt_prev;
	struct net_device *orig_dev;
	int ret = NET_RX_DROP;
	__be16 type;
	/* if we've gotten here through NAPI, check netpoll */
	if (netpoll_receive_skb(skb))
		return NET_RX_DROP;
	if (!skb->tstamp.tv64)
		net_timestamp(skb);
	if (!skb->iif)
		skb->iif = skb->dev->ifindex;
	orig_dev = skb_bond(skb);
	if (!orig_dev)
		return NET_RX_DROP;
	__get_cpu_var(netdev_rx_stat).total++;
	skb_reset_network_header(skb);	// point the network (and, below, transport) header at the current data
	skb_reset_transport_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;
	pt_prev = NULL;
	rcu_read_lock();
#ifdef CONFIG_NET_CLS_ACT
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif
	// ptype_all holds taps that want every protocol (e.g. packet sockets); normally this list is empty
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (!ptype->dev || ptype->dev == skb->dev) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);	// deliver to the previously matched handler
			pt_prev = ptype;
		}
	}
#ifdef CONFIG_NET_CLS_ACT
	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;
ncls:
#endif
	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;
	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;
	type = skb->protocol;
	list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) {
		// ptype_base is the per-protocol handler hash table; see dev_add_pack(&ip_packet_type) in inet_init()
		if (ptype->type == type && (!ptype->dev || ptype->dev == skb->dev)) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);	// deliver to the previously matched handler
			pt_prev = ptype;
		}
	}
	if (pt_prev) {
		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);	// final delivery: for ETH_P_IP this calls ip_rcv()
	} else {
		kfree_skb(skb);
		ret = NET_RX_DROP;
	}
out:
	rcu_read_unlock();
	return ret;
}
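The ptype_all list is how packet taps see every frame before protocol dispatch; from user space the matching facility is an AF_PACKET socket opened with ETH_P_ALL, which is what tcpdump-style tools use. A minimal sketch (requires CAP_NET_RAW / root):

/* User-space counterpart of a ptype_all tap: an AF_PACKET socket opened
 * with ETH_P_ALL receives a copy of every frame that passes through
 * netif_receive_skb(). Requires CAP_NET_RAW (root). */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/if_ether.h>

int main(void)
{
	unsigned char frame[2048];
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	if (fd < 0) {
		perror("socket(AF_PACKET)");
		return 1;
	}
	for (int i = 0; i < 5; i++) {			/* grab a handful of frames */
		unsigned short ethertype;
		ssize_t n = recvfrom(fd, frame, sizeof(frame), 0, NULL, NULL);

		if (n < 0) {
			perror("recvfrom");
			break;
		}
		if (n < 14)
			continue;			/* too short to carry an Ethernet header */
		/* bytes 12..13 of an Ethernet frame hold the EtherType (0x0800 = IPv4) */
		memcpy(&ethertype, frame + 12, sizeof(ethertype));
		printf("got %zd bytes, ethertype 0x%04x\n", n, ntohs(ethertype));
	}
	close(fd);
	return 0;
}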
net/ipv4/af_inet.c
static struct packet_type ip_packet_type = {
    .type = __constant_htons(ETH_P_IP),
    .func = ip_rcv,
    .gso_send_check = inet_gso_send_check,
    .gso_segment = inet_gso_segment,
};
static int __init inet_init(void)
{
    ...
    dev_add_pack(&ip_packet_type);
    ...
}
net/core/dev.c
/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep therefore it can not
 *	guarantee all CPU's that are in middle of receiving packets
 *	will see the new packet type (until the next received packet).
 */
void dev_add_pack(struct packet_type *pt)
{
    int hash;
    spin_lock_bh(&ptype_lock);
    if (pt->type == htons(ETH_P_ALL))
        list_add_rcu(&pt->list, &ptype_all);
    else {
        hash = ntohs(pt->type) & 15;
        list_add_rcu(&pt->list, &ptype_base[hash]);	// the ptype_base array walked in netif_receive_skb()
    }
    spin_unlock_bh(&ptype_lock);
}
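The bucket is just the low four bits of the host-order EtherType, which is why the lookup in netif_receive_skb() uses ntohs(type) & 15 as well. A quick user-space check of where a few common protocols land:

/* Reproduces the dev_add_pack() bucket calculation: hash = ntohs(type) & 15.
 * The constants below are already host-order, so masking them directly is
 * equivalent to the kernel's ntohs(pt->type) & 15. */
#include <stdio.h>
#include <linux/if_ether.h>

int main(void)
{
	unsigned short types[] = { ETH_P_IP, ETH_P_ARP, ETH_P_IPV6 };
	const char *names[]    = { "ETH_P_IP", "ETH_P_ARP", "ETH_P_IPV6" };

	for (int i = 0; i < 3; i++)
		printf("%-11s 0x%04x -> ptype_base[%u]\n",
		       names[i], types[i], types[i] & 15u);
	return 0;
}

So ip_packet_type ends up on ptype_base[0], which is the list netif_receive_skb() walks for EtherType 0x0800 frames.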
net/ipv4/ip_input.c
/*
 * 	Main IP Receive routine.
 * 	Entry point of the IP layer.
 */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
    struct iphdr *iph;
    u32 len;
    if (dev->nd_net != &init_net)
	    goto drop;
    if (skb->pkt_type == PACKET_OTHERHOST)
	    goto drop;
    IP_INC_STATS_BH(IPSTATS_MIB_INRECEIVES);
    if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
	    IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
	    goto out;
    }
    // make sure the whole IP header sits in the linear (directly accessible) part of the skb
    if (!pskb_may_pull(skb, sizeof(struct iphdr)))
	    goto inhdr_error;
    iph = ip_hdr(skb);
    if (iph->ihl < 5 || iph->version != 4)
	    goto inhdr_error;
    if (!pskb_may_pull(skb, iph->ihl*4))
	    goto inhdr_error;
    iph = ip_hdr(skb);
    if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
	    goto inhdr_error;
    len = ntohs(iph->tot_len);
    if (skb->len < len) {
	    IP_INC_STATS_BH(IPSTATS_MIB_INTRUNCATEDPKTS);
	    goto drop;
    } else if (len < (iph->ihl*4))
	    goto inhdr_error;
    if (pskb_trim_rcsum(skb, len)) {
	    IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
	    goto drop;
    }
    /* Remove any debris in the socket control block */
    memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
    // netfilter hook NF_IP_PRE_ROUTING; if the packet is accepted, processing continues in ip_rcv_finish()
    return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL,
	           ip_rcv_finish);
inhdr_error:
    IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
drop:
    kfree_skb(skb);
out:
    return NET_RX_DROP;
}
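The header checks above can be reproduced on any buffer in user space with the standard struct iphdr. The sketch below mirrors them, substituting a generic 16-bit one's-complement sum for the architecture-optimized ip_fast_csum():

/* User-space mirror of the sanity checks in ip_rcv(): header present,
 * version/IHL sane, checksum correct, tot_len consistent. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/ip.h>

static uint16_t ip_checksum(const void *hdr, int words16)
{
	const uint16_t *p = hdr;
	uint32_t sum = 0;

	while (words16--)
		sum += *p++;
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);	/* fold the carries */
	return (uint16_t)~sum;				/* 0 means "valid" when the check field is included */
}

static int check_ipv4_header(const unsigned char *buf, size_t len)
{
	const struct iphdr *iph = (const struct iphdr *)buf;

	if (len < sizeof(struct iphdr))
		return -1;				/* cf. pskb_may_pull(sizeof(struct iphdr)) */
	if (iph->ihl < 5 || iph->version != 4)
		return -1;
	if (len < (size_t)iph->ihl * 4)
		return -1;				/* options not fully present */
	if (ip_checksum(iph, iph->ihl * 2) != 0)
		return -1;				/* cf. ip_fast_csum() */
	if (ntohs(iph->tot_len) < iph->ihl * 4 || ntohs(iph->tot_len) > len)
		return -1;				/* truncated or bogus total length */
	return 0;
}

int main(void)
{
	unsigned char pkt[sizeof(struct iphdr)] = { 0 };
	struct iphdr *iph = (struct iphdr *)pkt;

	iph->version = 4;
	iph->ihl = 5;
	iph->tot_len = htons(sizeof(struct iphdr));
	iph->ttl = 64;
	iph->protocol = IPPROTO_ICMP;
	iph->check = ip_checksum(iph, iph->ihl * 2);	/* computed with check = 0 */

	printf("header is %s\n", check_ipv4_header(pkt, sizeof(pkt)) == 0 ? "ok" : "bad");
	return 0;
}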
net/ipv4/ip_input.c
static int ip_rcv_finish(struct sk_buff *skb)
{
    const struct iphdr *iph = ip_hdr(skb);
    struct rtable *rt;
    if (skb->dst == NULL) {
        // initialize skb->dst, i.e. perform the route lookup
	    int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
				     skb->dev);
	    if (unlikely(err)) {
		    if (err == -EHOSTUNREACH)
			    IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
		    else if (err == -ENETUNREACH)
			    IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
		    goto drop;
	    }
    }
#ifdef CONFIG_NET_CLS_ROUTE
    if (unlikely(skb->dst->tclassid)) {
	    struct ip_rt_acct *st = ip_rt_acct + 256*smp_processor_id();
	    u32 idx = skb->dst->tclassid;
	    st[idx&0xFF].o_packets++;
	    st[idx&0xFF].o_bytes+=skb->len;
	    st[(idx>>16)&0xFF].i_packets++;
	    st[(idx>>16)&0xFF].i_bytes+=skb->len;
    }
#endif
    if (iph->ihl > 5 && ip_rcv_options(skb))
	    goto drop;
    rt = (struct rtable*)skb->dst;
    if (rt->rt_type == RTN_MULTICAST)
	    IP_INC_STATS_BH(IPSTATS_MIB_INMCASTPKTS);
    else if (rt->rt_type == RTN_BROADCAST)
	    IP_INC_STATS_BH(IPSTATS_MIB_INBCASTPKTS);
    return dst_input(skb);  // dispatch through the route's input handler
drop:
    kfree_skb(skb);
    return NET_RX_DROP;
}
include/net/dst.h
// dispatch through the route's input handler; nothing special here
static inline int dst_input(struct sk_buff *skb)
{
    int err;
    for (;;) {
	    err = skb->dst->input(skb); // for IPv4 this is ip_local_deliver() or ip_forward()
	    if (likely(err == 0))
		    return err;
	    /* Oh, Jamal... Seems, I will not forgive you this mess. :-) */
	    if (unlikely(err != NET_XMIT_BYPASS))
		    return err;
    }
}
For IPv4 the route lookup leaves essentially two input handlers: ip_local_deliver and ip_forward, i.e. deliver to this host or forward the packet. We only follow local delivery here; the dispatch itself is nothing more than calling a function pointer, as sketched below.
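/* Sketch (not kernel code) of the dst->input dispatch: the route lookup
 * stores a function pointer and dst_input() simply calls it; for IPv4 it
 * points at ip_local_deliver() or ip_forward(). */
#include <stdio.h>

struct packet;					/* opaque stand-in for sk_buff */

struct dst_entry {
	int (*input)(struct packet *pkt);	/* filled in by the route lookup */
};

static int local_deliver(struct packet *pkt)
{
	(void)pkt;
	puts("deliver to local sockets");
	return 0;
}

static int forward(struct packet *pkt)
{
	(void)pkt;
	puts("forward out another interface");
	return 0;
}

static int dispatch(struct dst_entry *dst, struct packet *pkt)
{
	return dst->input(pkt);			/* mirrors skb->dst->input(skb) */
}

int main(void)
{
	struct dst_entry to_local = { .input = local_deliver };
	struct dst_entry to_fwd   = { .input = forward };

	dispatch(&to_local, NULL);		/* route said: this host */
	dispatch(&to_fwd, NULL);		/* route said: somewhere else */
	return 0;
}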
net/ipv4/ip_input.c
/*
 * 	Deliver IP Packets to the higher protocol layers.
 */
int ip_local_deliver(struct sk_buff *skb)
{
    /*
     *	Reassemble IP fragments.
     */
    if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
	    if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
		    return 0;
    }
    // another netfilter hook: NF_IP_LOCAL_IN
    return NF_HOOK(PF_INET, NF_IP_LOCAL_IN, skb, skb->dev, NULL,
	           ip_local_deliver_finish);    // processing continues in ip_local_deliver_finish()
}
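The fragment test above is purely a bit test on frag_off: the first fragment has MF set, later fragments have a non-zero offset, and an unfragmented datagram has neither. The same test in user space (glibc spells the offset mask IP_OFFMASK rather than the kernel's IP_OFFSET):

/* Same test as in ip_local_deliver(): a datagram needs reassembly if the
 * More-Fragments bit is set or the fragment offset is non-zero. */
#include <stdio.h>
#include <arpa/inet.h>
#include <netinet/ip.h>

static int needs_defrag(const struct iphdr *iph)
{
	return (iph->frag_off & htons(IP_MF | IP_OFFMASK)) != 0;
}

int main(void)
{
	struct iphdr whole = { .frag_off = htons(IP_DF) };	/* DF only: not a fragment */
	struct iphdr first = { .frag_off = htons(IP_MF) };	/* first fragment: MF set  */
	struct iphdr later = { .frag_off = htons(185)   };	/* offset 185 * 8 bytes    */

	printf("%d %d %d\n", needs_defrag(&whole), needs_defrag(&first), needs_defrag(&later));
	return 0;
}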
net/ipv4/ip_input.c
static int ip_local_deliver_finish(struct sk_buff *skb)
{
    __skb_pull(skb, ip_hdrlen(skb));
    /* Point into the IP datagram, just past the header. */
    skb_reset_transport_header(skb);
    rcu_read_lock();
    {
	    /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */
	    int protocol = ip_hdr(skb)->protocol;
	    int hash;
	    struct sock *raw_sk;
	    struct net_protocol *ipprot;
    resubmit:
	    hash = protocol & (MAX_INET_PROTOS - 1);
	    raw_sk = sk_head(&raw_v4_htable[hash]);
	    /* If there maybe a raw socket we must check - if not we
	     * don't care less
	     */
	    if (raw_sk && !raw_v4_input(skb, ip_hdr(skb), hash))    // hand a copy to any matching raw sockets
		    raw_sk = NULL;
	    if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) {
		    int ret;
		    if (!ipprot->no_policy) {
			    if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
				    kfree_skb(skb);
				    goto out;
			    }
			    nf_reset(skb);
		    }
		    ret = ipprot->handler(skb);
		    if (ret < 0) {
			    protocol = -ret;
			    goto resubmit;
		    }
		    IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
	    } else {
		    if (!raw_sk) {
			    if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
				    IP_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS);
				    icmp_send(skb, ICMP_DEST_UNREACH,
					      ICMP_PROT_UNREACH, 0);
			    }
		    } else
			    IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
		    kfree_skb(skb);
	    }
    }
 out:
    rcu_read_unlock();
    return 0;
}
net/ipv4/raw.c
int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
{
    struct sock *sk;
    struct hlist_head *head;
    int delivered = 0;
    read_lock(&raw_v4_lock);
    head = &raw_v4_htable[hash];
    if (hlist_empty(head))
	    goto out;
    sk = __raw_v4_lookup(__sk_head(head), iph->protocol,
		         iph->saddr, iph->daddr,
		         skb->dev->ifindex);
    while (sk) {
	    delivered = 1;
	    if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) {
		    struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
		    /* Not releasing hash table! */
		    if (clone)
			    raw_rcv(sk, clone); // push the clone to this raw socket
	    }
	    sk = __raw_v4_lookup(sk_next(sk), iph->protocol,
			         iph->saddr, iph->daddr,
			         skb->dev->ifindex);
    }
out:
    read_unlock(&raw_v4_lock);
    return delivered;
}
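From user space, the sockets that raw_v4_input() matches are ordinary SOCK_RAW sockets; the clone queued here is what a later recv() on such a socket returns, full IP header included. A minimal receiver for ICMP, to be run as root while pinging the host:

/* Minimal consumer of what raw_v4_input()/raw_rcv() deliver: a raw IPv4
 * socket for IPPROTO_ICMP receives full IP datagrams for every ICMP packet
 * addressed to this host. Requires root. */
#include <stdio.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>
#include <sys/socket.h>

int main(void)
{
	unsigned char buf[4096];
	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);

	if (fd < 0) {
		perror("socket(SOCK_RAW)");
		return 1;
	}
	for (;;) {
		ssize_t n = recv(fd, buf, sizeof(buf), 0);
		if (n < 0) {
			perror("recv");
			break;
		}
		const struct iphdr *iph = (const struct iphdr *)buf;
		const struct icmphdr *icmp =
			(const struct icmphdr *)(buf + iph->ihl * 4);
		printf("ICMP type %d code %d from %s (%zd bytes)\n",
		       icmp->type, icmp->code,
		       inet_ntoa((struct in_addr){ .s_addr = iph->saddr }), n);
	}
	close(fd);
	return 0;
}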
net/ipv4/raw.c
int raw_rcv(struct sock *sk, struct sk_buff *skb)
{
    if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
	    kfree_skb(skb);
	    return NET_RX_DROP;
    }
    nf_reset(skb);
    skb_push(skb, skb->data - skb_network_header(skb));
    raw_rcv_skb(sk, skb);   // charge the skb to the socket
    return 0;
}
net/ipv4/raw.c
static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb)
{
    /* Charge it to the socket. */
    if (sock_queue_rcv_skb(sk, skb) < 0) {  // queue onto the socket's receive queue
	    /* FIXME: increment a raw drops counter here */
	    kfree_skb(skb);
	    return NET_RX_DROP;
    }
    return NET_RX_SUCCESS;
}
net/core/sock.c
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
    int err = 0;
    int skb_len;
    /* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
       number of warnings when compiling with -W --ANK
     */
    if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
        (unsigned)sk->sk_rcvbuf) {
	    err = -ENOMEM;
	    goto out;
    }
    err = sk_filter(sk, skb);
    if (err)
	    goto out;
    skb->dev = NULL;
    skb_set_owner_r(skb, sk);
    /* Cache the SKB length before we tack it onto the receive
     * queue.  Once it is added it no longer belongs to us and
     * may be freed by other threads of control pulling packets
     * from the queue.
     */
    skb_len = skb->len;
    skb_queue_tail(&sk->sk_receive_queue, skb); // append to the socket's receive queue
    if (!sock_flag(sk, SOCK_DEAD))
	    sk->sk_data_ready(sk, skb_len); // data has arrived: a reader blocked on this socket can now be woken
out:
    return err;
}
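The sk_rmem_alloc vs. sk_rcvbuf comparison is where the per-socket receive buffer limit is enforced; from user space that limit is the SO_RCVBUF socket option, which the kernel doubles to leave headroom for bookkeeping (see socket(7)). A quick look:

/* The limit checked against sk_rmem_alloc above is SO_RCVBUF from user
 * space. Linux doubles whatever is requested, which is why the value read
 * back differs from the one written. */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int requested = 64 * 1024, actual = 0;
	socklen_t len = sizeof(actual);

	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &requested, sizeof(requested));
	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &actual, &len);
	printf("asked for %d bytes of receive buffer, kernel granted %d\n",
	       requested, actual);
	close(fd);
	return 0;
}

Once a packet passes this check and has been appended by skb_queue_tail(), it stays on sk->sk_receive_queue until a recv()/read() on the socket pulls it off; sk_data_ready() is what wakes a blocked reader up.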
net/core/skbuff.c
void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
{
    unsigned long flags;
    spin_lock_irqsave(&list->lock, flags);
    __skb_queue_tail(list, newsk);      // the actual list insertion, done under the queue lock
    spin_unlock_irqrestore(&list->lock, flags);
}