The Linux Kernel Packet Receive Path
net/core/dev.c
static int __init net_dev_init(void)
{
	...
	queue->backlog.poll = process_backlog;	// each CPU's backlog device uses process_backlog as its NAPI poll function
	...
	open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);	// NET_RX_SOFTIRQ drives the receive path
	...
}
net/core/dev.c
static int process_backlog(struct napi_struct *napi, int quota)
{
	int work = 0;
	struct softnet_data *queue = &__get_cpu_var(softnet_data);	// this CPU's softnet_data (per-CPU variable)
	unsigned long start_time = jiffies;
	napi->weight = weight_p;
	do {
		struct sk_buff *skb;	// the packet buffer
		struct net_device *dev;
		local_irq_disable();	// disable local interrupts
		skb = __skb_dequeue(&queue->input_pkt_queue);	// pull one skb off the per-CPU input queue (how it was enqueued is not covered here)
		if (!skb) {
			__napi_complete(napi);
			local_irq_enable();
			break;
		}
		local_irq_enable();	// re-enable local interrupts
		dev = skb->dev;
		netif_receive_skb(skb);	// hand the packet to the protocol layers (IP, for our purposes)
		dev_put(dev);	// release the reference so the device may be freed
	} while (++work < quota && jiffies == start_time);
	return work;
}
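Note the loop condition: process_backlog handles at most quota packets per call and also bails out as soon as the jiffy counter advances, so a flooded queue cannot monopolize the CPU inside the softirq. The following self-contained user-space sketch (not kernel code; a plain array and time() stand in for input_pkt_queue and jiffies) shows the same budget pattern:

/* User-space sketch of the budget pattern in process_backlog(): handle at
 * most `quota` items per call and stop early when the timer tick changes. */
#include <stdio.h>
#include <time.h>

#define NITEMS 8

static int queue[NITEMS] = { 1, 2, 3, 4, 5, 6, 7, 8 };
static int head;					/* index of the next item */

static int *pop(void)
{
	return head < NITEMS ? &queue[head++] : NULL;
}

static int drain_with_budget(int quota)
{
	int work = 0;
	time_t start = time(NULL);			/* snapshot, like start_time = jiffies */

	do {
		int *item = pop();
		if (!item)
			break;				/* queue empty, nothing left to do */
		printf("handled item %d\n", *item);	/* stands in for netif_receive_skb() */
	} while (++work < quota && time(NULL) == start);

	return work;					/* work == quota means "come back later" */
}

int main(void)
{
	printf("processed %d items this round\n", drain_with_budget(3));
	return 0;
}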
include/linux/skbuff.h
static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list)
{
	struct sk_buff *next, *prev, *result;
	prev = (struct sk_buff *) list;
	next = prev->next;
	result = NULL;
	if (next != prev) {
		result	     = next;
		next	     = next->next;
		list->qlen--;
		next->prev   = prev;
		prev->next   = next;
		result->next = result->prev = NULL;
	}
	return result;
}
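The cast on the first line works because sk_buff_head intentionally shares its first two members (next, prev) with sk_buff, so the head of the queue can be treated as just another node in a circular doubly-linked list; the queue is empty exactly when head->next points back at the head. A self-contained user-space mirror of the same unlink logic (illustrative only, not kernel code):

/* User-space mirror of __skb_dequeue(): the list head has the same first
 * two members as a node, so it can be cast to a node and treated as part
 * of the circular doubly-linked list. */
#include <stdio.h>

struct node {
	struct node *next, *prev;
	int value;
};

struct list_head {
	struct node *next, *prev;		/* same layout as a node's link fields */
	unsigned int qlen;
};

static void list_init(struct list_head *list)
{
	list->next = list->prev = (struct node *)list;
	list->qlen = 0;
}

static void enqueue_tail(struct list_head *list, struct node *n)
{
	struct node *head = (struct node *)list;

	n->next = head;
	n->prev = list->prev;
	list->prev->next = n;
	list->prev = n;
	list->qlen++;
}

static struct node *dequeue(struct list_head *list)
{
	struct node *prev = (struct node *)list;	/* the head doubles as a node */
	struct node *next = prev->next;
	struct node *result = NULL;

	if (next != prev) {				/* non-empty: unlink the first node */
		result = next;
		next = next->next;
		list->qlen--;
		next->prev = prev;
		prev->next = next;
		result->next = result->prev = NULL;
	}
	return result;
}

int main(void)
{
	struct list_head list;
	struct node a = { .value = 1 }, b = { .value = 2 };
	struct node *n;

	list_init(&list);
	enqueue_tail(&list, &a);
	enqueue_tail(&list, &b);
	while ((n = dequeue(&list)) != NULL)
		printf("dequeued %d, qlen now %u\n", n->value, list.qlen);
	return 0;
}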
net/core/dev.c
/*
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	netif_receive_skb() is the main receive data processing function.
 *	It always succeeds. The buffer may be dropped during processing
 *	for congestion control or by the protocol layers.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled. (note this requirement)
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	struct packet_type *ptype, *pt_prev;
	struct net_device *orig_dev;
	int ret = NET_RX_DROP;
	__be16 type;
	/* if we've gotten here through NAPI, check netpoll */
	if (netpoll_receive_skb(skb))
		return NET_RX_DROP;
	if (!skb->tstamp.tv64)
		net_timestamp(skb);
	if (!skb->iif)
		skb->iif = skb->dev->ifindex;
	orig_dev = skb_bond(skb);
	if (!orig_dev)
		return NET_RX_DROP;
	__get_cpu_var(netdev_rx_stat).total++;
	skb_reset_network_header(skb);	// point the network (and, below, transport) header at the current data
	skb_reset_transport_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;
	pt_prev = NULL;
	rcu_read_lock();
#ifdef CONFIG_NET_CLS_ACT
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif
	// ptype_all holds taps that want every protocol (e.g. packet sockets); normally this list is empty
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (!ptype->dev || ptype->dev == skb->dev) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);	// deliver to the previously matched handler
			pt_prev = ptype;
		}
	}
#ifdef CONFIG_NET_CLS_ACT
	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;
ncls:
#endif
	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;
	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;
	type = skb->protocol;
	list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) {
		// ptype_base is the per-protocol handler hash table; see dev_add_pack(&ip_packet_type) in inet_init()
		if (ptype->type == type && (!ptype->dev || ptype->dev == skb->dev)) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);	// deliver to the previously matched handler
			pt_prev = ptype;
		}
	}
	if (pt_prev) {
		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);	// final delivery: for ETH_P_IP this calls ip_rcv()
	} else {
		kfree_skb(skb);
		ret = NET_RX_DROP;
	}
out:
	rcu_read_unlock();
	return ret;
}
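The ptype_all list is how packet taps see every frame before protocol dispatch; from user space the matching facility is an AF_PACKET socket opened with ETH_P_ALL, which is what tcpdump-style tools use. A minimal sketch (requires CAP_NET_RAW / root):

/* User-space counterpart of a ptype_all tap: an AF_PACKET socket opened
 * with ETH_P_ALL receives a copy of every frame that passes through
 * netif_receive_skb(). Requires CAP_NET_RAW (root). */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/if_ether.h>

int main(void)
{
	unsigned char frame[2048];
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	if (fd < 0) {
		perror("socket(AF_PACKET)");
		return 1;
	}
	for (int i = 0; i < 5; i++) {			/* grab a handful of frames */
		unsigned short ethertype;
		ssize_t n = recvfrom(fd, frame, sizeof(frame), 0, NULL, NULL);

		if (n < 0) {
			perror("recvfrom");
			break;
		}
		if (n < 14)
			continue;			/* too short to carry an Ethernet header */
		/* bytes 12..13 of an Ethernet frame hold the EtherType (0x0800 = IPv4) */
		memcpy(&ethertype, frame + 12, sizeof(ethertype));
		printf("got %zd bytes, ethertype 0x%04x\n", n, ntohs(ethertype));
	}
	close(fd);
	return 0;
}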
net/ipv4/af_inet.c
static struct packet_type ip_packet_type = {
    .type = __constant_htons(ETH_P_IP),
    .func = ip_rcv,
    .gso_send_check = inet_gso_send_check,
    .gso_segment = inet_gso_segment,
};
static int __init inet_init(void)
{
    ...
    dev_add_pack(&ip_packet_type);
    ...
}
net/core/dev.c
/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep therefore it can not
 *	guarantee all CPU's that are in middle of receiving packets
 *	will see the new packet type (until the next received packet).
 */
void dev_add_pack(struct packet_type *pt)
{
    int hash;
    spin_lock_bh(&ptype_lock);
    if (pt->type == htons(ETH_P_ALL))
        list_add_rcu(&pt->list, &ptype_all);
    else {
        hash = ntohs(pt->type) & 15;
        list_add_rcu(&pt->list, &ptype_base[hash]);	// the ptype_base array walked in netif_receive_skb()
    }
    spin_unlock_bh(&ptype_lock);
}
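The bucket is just the low four bits of the host-order EtherType, which is why the lookup in netif_receive_skb() uses ntohs(type) & 15 as well. A quick user-space check of where a few common protocols land:

/* Reproduces the dev_add_pack() bucket calculation: hash = ntohs(type) & 15.
 * The constants below are already host-order, so masking them directly is
 * equivalent to the kernel's ntohs(pt->type) & 15. */
#include <stdio.h>
#include <linux/if_ether.h>

int main(void)
{
	unsigned short types[] = { ETH_P_IP, ETH_P_ARP, ETH_P_IPV6 };
	const char *names[]    = { "ETH_P_IP", "ETH_P_ARP", "ETH_P_IPV6" };

	for (int i = 0; i < 3; i++)
		printf("%-11s 0x%04x -> ptype_base[%u]\n",
		       names[i], types[i], types[i] & 15u);
	return 0;
}

So ip_packet_type ends up on ptype_base[0], which is the list netif_receive_skb() walks for EtherType 0x0800 frames.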
net/ipv4/ip_input.c
/*
 * 	Main IP Receive routine.
 * 	Entry point of the IP layer.
 */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
    struct iphdr *iph;
    u32 len;
    if (dev->nd_net != &init_net)
	    goto drop;
    if (skb->pkt_type == PACKET_OTHERHOST)
	    goto drop;
    IP_INC_STATS_BH(IPSTATS_MIB_INRECEIVES);
    if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
	    IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
	    goto out;
    }
    // make sure the whole IP header sits in the linear (directly accessible) part of the skb
    if (!pskb_may_pull(skb, sizeof(struct iphdr)))
	    goto inhdr_error;
    iph = ip_hdr(skb);
    if (iph->ihl < 5 || iph->version != 4)
	    goto inhdr_error;
    if (!pskb_may_pull(skb, iph->ihl*4))
	    goto inhdr_error;
    iph = ip_hdr(skb);
    if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
	    goto inhdr_error;
    len = ntohs(iph->tot_len);
    if (skb->len < len) {
	    IP_INC_STATS_BH(IPSTATS_MIB_INTRUNCATEDPKTS);
	    goto drop;
    } else if (len < (iph->ihl*4))
	    goto inhdr_error;
    if (pskb_trim_rcsum(skb, len)) {
	    IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
	    goto drop;
    }
    /* Remove any debris in the socket control block */
    memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
    // netfilter hook NF_IP_PRE_ROUTING; if the packet is accepted, processing continues in ip_rcv_finish()
    return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL,
	           ip_rcv_finish);
inhdr_error:
    IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
drop:
    kfree_skb(skb);
out:
    return NET_RX_DROP;
}
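The header checks above can be reproduced on any buffer in user space with the standard struct iphdr. The sketch below mirrors them, substituting a generic 16-bit one's-complement sum for the architecture-optimized ip_fast_csum():

/* User-space mirror of the sanity checks in ip_rcv(): header present,
 * version/IHL sane, checksum correct, tot_len consistent. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/ip.h>

static uint16_t ip_checksum(const void *hdr, int words16)
{
	const uint16_t *p = hdr;
	uint32_t sum = 0;

	while (words16--)
		sum += *p++;
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);	/* fold the carries */
	return (uint16_t)~sum;				/* 0 means "valid" when the check field is included */
}

static int check_ipv4_header(const unsigned char *buf, size_t len)
{
	const struct iphdr *iph = (const struct iphdr *)buf;

	if (len < sizeof(struct iphdr))
		return -1;				/* cf. pskb_may_pull(sizeof(struct iphdr)) */
	if (iph->ihl < 5 || iph->version != 4)
		return -1;
	if (len < (size_t)iph->ihl * 4)
		return -1;				/* options not fully present */
	if (ip_checksum(iph, iph->ihl * 2) != 0)
		return -1;				/* cf. ip_fast_csum() */
	if (ntohs(iph->tot_len) < iph->ihl * 4 || ntohs(iph->tot_len) > len)
		return -1;				/* truncated or bogus total length */
	return 0;
}

int main(void)
{
	unsigned char pkt[sizeof(struct iphdr)] = { 0 };
	struct iphdr *iph = (struct iphdr *)pkt;

	iph->version = 4;
	iph->ihl = 5;
	iph->tot_len = htons(sizeof(struct iphdr));
	iph->ttl = 64;
	iph->protocol = IPPROTO_ICMP;
	iph->check = ip_checksum(iph, iph->ihl * 2);	/* computed with check = 0 */

	printf("header is %s\n", check_ipv4_header(pkt, sizeof(pkt)) == 0 ? "ok" : "bad");
	return 0;
}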
net/ipv4/ip_input.c
static int ip_rcv_finish(struct sk_buff *skb)
{
    const struct iphdr *iph = ip_hdr(skb);
    struct rtable *rt;
    if (skb->dst == NULL) {
        // initialize skb->dst, i.e. perform the route lookup
	    int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
				     skb->dev);
	    if (unlikely(err)) {
		    if (err == -EHOSTUNREACH)
			    IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
		    else if (err == -ENETUNREACH)
			    IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
		    goto drop;
	    }
    }
#ifdef CONFIG_NET_CLS_ROUTE
    if (unlikely(skb->dst->tclassid)) {
	    struct ip_rt_acct *st = ip_rt_acct + 256*smp_processor_id();
	    u32 idx = skb->dst->tclassid;
	    st[idx&0xFF].o_packets++;
	    st[idx&0xFF].o_bytes+=skb->len;
	    st[(idx>>16)&0xFF].i_packets++;
	    st[(idx>>16)&0xFF].i_bytes+=skb->len;
    }
#endif
    if (iph->ihl > 5 && ip_rcv_options(skb))
	    goto drop;
    rt = (struct rtable*)skb->dst;
    if (rt->rt_type == RTN_MULTICAST)
	    IP_INC_STATS_BH(IPSTATS_MIB_INMCASTPKTS);
    else if (rt->rt_type == RTN_BROADCAST)
	    IP_INC_STATS_BH(IPSTATS_MIB_INBCASTPKTS);
    return dst_input(skb);  // dispatch through the route's input handler
drop:
    kfree_skb(skb);
    return NET_RX_DROP;
}
include/net/dst.h
// dispatch through the route's input handler; nothing special here
static inline int dst_input(struct sk_buff *skb)
{
    int err;
    for (;;) {
	    err = skb->dst->input(skb); // for IPv4 this is ip_local_deliver() or ip_forward()
	    if (likely(err == 0))
		    return err;
	    /* Oh, Jamal... Seems, I will not forgive you this mess. :-) */
	    if (unlikely(err != NET_XMIT_BYPASS))
		    return err;
    }
}
For IPv4 the route lookup leaves essentially two input handlers: ip_local_deliver and ip_forward, i.e. deliver to this host or forward the packet. We only follow local delivery here; the dispatch itself is nothing more than calling a function pointer, as sketched below.
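/* Sketch (not kernel code) of the dst->input dispatch: the route lookup
 * stores a function pointer and dst_input() simply calls it; for IPv4 it
 * points at ip_local_deliver() or ip_forward(). */
#include <stdio.h>

struct packet;					/* opaque stand-in for sk_buff */

struct dst_entry {
	int (*input)(struct packet *pkt);	/* filled in by the route lookup */
};

static int local_deliver(struct packet *pkt)
{
	(void)pkt;
	puts("deliver to local sockets");
	return 0;
}

static int forward(struct packet *pkt)
{
	(void)pkt;
	puts("forward out another interface");
	return 0;
}

static int dispatch(struct dst_entry *dst, struct packet *pkt)
{
	return dst->input(pkt);			/* mirrors skb->dst->input(skb) */
}

int main(void)
{
	struct dst_entry to_local = { .input = local_deliver };
	struct dst_entry to_fwd   = { .input = forward };

	dispatch(&to_local, NULL);		/* route said: this host */
	dispatch(&to_fwd, NULL);		/* route said: somewhere else */
	return 0;
}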
net/ipv4/ip_input.c
/*
 * 	Deliver IP Packets to the higher protocol layers.
 */
int ip_local_deliver(struct sk_buff *skb)
{
    /*
     *	Reassemble IP fragments.
     */
    if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
	    if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
		    return 0;
    }
    // another netfilter hook: NF_IP_LOCAL_IN
    return NF_HOOK(PF_INET, NF_IP_LOCAL_IN, skb, skb->dev, NULL,
	           ip_local_deliver_finish);    // processing continues in ip_local_deliver_finish()
}
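The fragment test above is purely a bit test on frag_off: the first fragment has MF set, later fragments have a non-zero offset, and an unfragmented datagram has neither. The same test in user space (glibc spells the offset mask IP_OFFMASK rather than the kernel's IP_OFFSET):

/* Same test as in ip_local_deliver(): a datagram needs reassembly if the
 * More-Fragments bit is set or the fragment offset is non-zero. */
#include <stdio.h>
#include <arpa/inet.h>
#include <netinet/ip.h>

static int needs_defrag(const struct iphdr *iph)
{
	return (iph->frag_off & htons(IP_MF | IP_OFFMASK)) != 0;
}

int main(void)
{
	struct iphdr whole = { .frag_off = htons(IP_DF) };	/* DF only: not a fragment */
	struct iphdr first = { .frag_off = htons(IP_MF) };	/* first fragment: MF set  */
	struct iphdr later = { .frag_off = htons(185)   };	/* offset 185 * 8 bytes    */

	printf("%d %d %d\n", needs_defrag(&whole), needs_defrag(&first), needs_defrag(&later));
	return 0;
}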
net/ipv4/ip_input.c
static int ip_local_deliver_finish(struct sk_buff *skb)
{
    __skb_pull(skb, ip_hdrlen(skb));
    /* Point into the IP datagram, just past the header. */
    skb_reset_transport_header(skb);
    rcu_read_lock();
    {
	    /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */
	    int protocol = ip_hdr(skb)->protocol;
	    int hash;
	    struct sock *raw_sk;
	    struct net_protocol *ipprot;
    resubmit:
	    hash = protocol & (MAX_INET_PROTOS - 1);
	    raw_sk = sk_head(&raw_v4_htable[hash]);
	    /* If there maybe a raw socket we must check - if not we
	     * don't care less
	     */
	    if (raw_sk && !raw_v4_input(skb, ip_hdr(skb), hash))    // hand a copy to any matching raw sockets
		    raw_sk = NULL;
	    if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) {
		    int ret;
		    if (!ipprot->no_policy) {
			    if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
				    kfree_skb(skb);
				    goto out;
			    }
			    nf_reset(skb);
		    }
		    ret = ipprot->handler(skb);
		    if (ret < 0) {
			    protocol = -ret;
			    goto resubmit;
		    }
		    IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
	    } else {
		    if (!raw_sk) {
			    if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
				    IP_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS);
				    icmp_send(skb, ICMP_DEST_UNREACH,
					      ICMP_PROT_UNREACH, 0);
			    }
		    } else
			    IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
		    kfree_skb(skb);
	    }
    }
 out:
    rcu_read_unlock();
    return 0;
}
net/ipv4/raw.c
int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
{
    struct sock *sk;
    struct hlist_head *head;
    int delivered = 0;
    read_lock(&raw_v4_lock);
    head = &raw_v4_htable[hash];
    if (hlist_empty(head))
	    goto out;
    sk = __raw_v4_lookup(__sk_head(head), iph->protocol,
		         iph->saddr, iph->daddr,
		         skb->dev->ifindex);
    while (sk) {
	    delivered = 1;
	    if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) {
		    struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
		    /* Not releasing hash table! */
		    if (clone)
			    raw_rcv(sk, clone); // push the clone to this raw socket
	    }
	    sk = __raw_v4_lookup(sk_next(sk), iph->protocol,
			         iph->saddr, iph->daddr,
			         skb->dev->ifindex);
    }
out:
    read_unlock(&raw_v4_lock);
    return delivered;
}
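From user space, the sockets that raw_v4_input() matches are ordinary SOCK_RAW sockets; the clone queued here is what a later recv() on such a socket returns, full IP header included. A minimal receiver for ICMP, to be run as root while pinging the host:

/* Minimal consumer of what raw_v4_input()/raw_rcv() deliver: a raw IPv4
 * socket for IPPROTO_ICMP receives full IP datagrams for every ICMP packet
 * addressed to this host. Requires root. */
#include <stdio.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>
#include <sys/socket.h>

int main(void)
{
	unsigned char buf[4096];
	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);

	if (fd < 0) {
		perror("socket(SOCK_RAW)");
		return 1;
	}
	for (;;) {
		ssize_t n = recv(fd, buf, sizeof(buf), 0);
		if (n < 0) {
			perror("recv");
			break;
		}
		const struct iphdr *iph = (const struct iphdr *)buf;
		const struct icmphdr *icmp =
			(const struct icmphdr *)(buf + iph->ihl * 4);
		printf("ICMP type %d code %d from %s (%zd bytes)\n",
		       icmp->type, icmp->code,
		       inet_ntoa((struct in_addr){ .s_addr = iph->saddr }), n);
	}
	close(fd);
	return 0;
}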
net/ipv4/raw.c
int raw_rcv(struct sock *sk, struct sk_buff *skb)
{
    if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
	    kfree_skb(skb);
	    return NET_RX_DROP;
    }
    nf_reset(skb);
    skb_push(skb, skb->data - skb_network_header(skb));
    raw_rcv_skb(sk, skb);   // charge the skb to the socket
    return 0;
}
net/ipv4/raw.c
static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb)
{
    /* Charge it to the socket. */
    if (sock_queue_rcv_skb(sk, skb) < 0) {  // queue onto the socket's receive queue
	    /* FIXME: increment a raw drops counter here */
	    kfree_skb(skb);
	    return NET_RX_DROP;
    }
    return NET_RX_SUCCESS;
}
net/core/sock.c
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
    int err = 0;
    int skb_len;
    /* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
       number of warnings when compiling with -W --ANK
     */
    if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
        (unsigned)sk->sk_rcvbuf) {
	    err = -ENOMEM;
	    goto out;
    }
    err = sk_filter(sk, skb);
    if (err)
	    goto out;
    skb->dev = NULL;
    skb_set_owner_r(skb, sk);
    /* Cache the SKB length before we tack it onto the receive
     * queue.  Once it is added it no longer belongs to us and
     * may be freed by other threads of control pulling packets
     * from the queue.
     */
    skb_len = skb->len;
    skb_queue_tail(&sk->sk_receive_queue, skb); // append to the socket's receive queue
    if (!sock_flag(sk, SOCK_DEAD))
	    sk->sk_data_ready(sk, skb_len); // data has arrived: a reader blocked on this socket can now be woken
out:
    return err;
}
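The sk_rmem_alloc vs. sk_rcvbuf comparison is where the per-socket receive buffer limit is enforced; from user space that limit is the SO_RCVBUF socket option, which the kernel doubles to leave headroom for bookkeeping (see socket(7)). A quick look:

/* The limit checked against sk_rmem_alloc above is SO_RCVBUF from user
 * space. Linux doubles whatever is requested, which is why the value read
 * back differs from the one written. */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int requested = 64 * 1024, actual = 0;
	socklen_t len = sizeof(actual);

	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &requested, sizeof(requested));
	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &actual, &len);
	printf("asked for %d bytes of receive buffer, kernel granted %d\n",
	       requested, actual);
	close(fd);
	return 0;
}

Once a packet passes this check and has been appended by skb_queue_tail(), it stays on sk->sk_receive_queue until a recv()/read() on the socket pulls it off; sk_data_ready() is what wakes a blocked reader up.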
net/core/skbuff.c
void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
{
    unsigned long flags;
    spin_lock_irqsave(&list->lock, flags);
    __skb_queue_tail(list, newsk);      // the actual list insertion, done under the queue lock
    spin_unlock_irqrestore(&list->lock, flags);
}