概括

现在的OVS使用microflow+megaflow缓存查询流表，ovs整体流程是从ovs_vport_receive（datapath/vport.c）开始，然后进入ovs_dp_process_packet（datapath/datapath.c），这个时候调用ovs_flow_tbl_lookup_stats（datapath/flow_table.c）开始查，查microflow获得mask_array里的索引索找到mask，通过mask去找megaflow里的掩码元素，再去定位哈希桶，如果没找到就upcall去用户态：

查找 microflow 缓存：根据数据报文 SKB 的 hash 值，定位到 mask_cache_entry 数组中的某个元素，并得到该元素缓存的掩码数组索引值；

查找 megaflow 缓存：根据步骤 1 中查找到的掩码数组索引值，定位到掩码数组中的某个元素，并得到该元素的掩码，然后根据掩码定位到具体的哈希桶，并遍历该哈希桶中的所有节点，直到找到匹配的 flow。

说的是流表过程，所以就从ovs_flow_tbl_lookup_stats开始。

正文

查找 microflow 缓存

OVS 内核态流表查找的入口函数是定义在 datapath/flow_table.c 文件中的,在ovs_dp_process_packet里调用： flow = ovs_flow_tbl_lookup_stats(&dp->table, key, skb_get_hash(skb), &n_mask_hit);

struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,

                                          const struct sw_flow_key *key,

                                          u32 skb_hash,

                                          u32 *n_mask_hit)

{

        struct mask_array *ma = rcu_dereference(tbl->mask_array);

        struct table_instance *ti = rcu_dereference(tbl->ti);

        struct mask_cache_entry *entries, *ce;

        struct sw_flow *flow;

        u32 hash;

        int seg;

        *n_mask_hit = 0;

        if (unlikely(!skb_hash)) {

                u32 mask_index = 0;

                return flow_lookup(tbl, ti, ma, key, n_mask_hit, &mask_index);

        }

        /* Pre and post recirulation flows usually have the same skb_hash

         * value. To avoid hash collisions, rehash the 'skb_hash' with

         * 'recirc_id'.  */

        if (key->recirc_id)

                skb_hash = jhash_1word(skb_hash, key->recirc_id);

        ce = NULL;

        hash = skb_hash;

        entries = this_cpu_ptr(tbl->mask_cache);

        /* Find the cache entry 'ce' to operate on. */

        for (seg = 0; seg < MC_HASH_SEGS; seg++) {

                int index = hash & (MC_HASH_ENTRIES - 1);

                struct mask_cache_entry *e;

                e = &entries[index];

                if (e->skb_hash == skb_hash) {

                        flow = flow_lookup(tbl, ti, ma, key, n_mask_hit,

                                           &e->mask_index);

                        if (!flow)

                                e->skb_hash = 0;

                        return flow;

                }

                if (!ce || e->skb_hash < ce->skb_hash)

                        ce = e;  /* A better replacement cache candidate. */

                hash >>= MC_HASH_SHIFT;

        }

        /* Cache miss, do full lookup. */

        flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, &ce->mask_index);

        if (flow)

                ce->skb_hash = skb_hash;

        return flow;

}

ovs_flow_tbl_lookup_stats() 的函数参数如下：

tbl：类型为 struct flow_table，表示专属于每个 datapath 的流表组织结构；

key：类型为 struct sw_flow_key，表示从数据报文提取出来的匹配关键字；

skb_hash：表示数据报文 SKB 的 hash 值；

n_mask_hit：输出参数，表示尝试匹配掩码的次数。

1.当skb_hash为0的时候，完全查找mask_array表，不更新cache

// 如果 skb_hash 为 0，则 full lookup

if (unlikely(!skb_hash)) {

    u32 mask_index = 0;

    return flow_lookup(tbl, ti, ma, key, n_mask_hit, &mask_index);

}

// 当数据报文需要在 OVS 中重新进入流水线

if (key->recirc_id)

    skb_hash = jhash_1word(skb_hash, key->recirc_id);

这里说的不更新的：因为skb_hash默认就是0，如果找到了，更不更新都是0，没找到就更不影响了

2.找到mask_cache_entry存在mask_index

ce = NULL;

hash = skb_hash;

// mask_cache_entry 数组，大小为 256，即 microflow cache

// 获取当前cpu的mash_cache

entries = this_cpu_ptr(tbl->mask_cache);

/* Find the cache entry 'ce' to operate on. */

// 将 hash 分为 4 个字节，从低到高的顺序，进行查找，这样一个hash可以用4个桶，效率高

//MC_HASH_SEGS = 4

for (seg = 0; seg < MC_HASH_SEGS; seg++) {

    //MC_HASH_ENTRIES = 256

    int index = hash & (MC_HASH_ENTRIES - 1); // 255是8位1，这样就是获得最后8位（1字节）

    struct mask_cache_entry *e;

    e = &entries[index];

    if (e->skb_hash == skb_hash) {

        flow = flow_lookup(tbl, ti, ma, key, n_mask_hit,

                   &e->mask_index);

        if (!flow)

            e->skb_hash = 0;

        return flow;

    }

    // 选出 4 个字节中 skb hash 值最小的那个，作为没找到缓存时的最佳候选

    if (!ce || e->skb_hash < ce->skb_hash)

        ce = e;  /* A better replacement cache candidate. */

    // MC_HASH_SHIFT = 8

    hash >>= MC_HASH_SHIFT;

}

主要说一下hash：

32位的hash值，变成4个8位，正好是mask_cache_entry[256]大小，相当于一个hash值对应4个桶的位置，有一个匹配就行，这种好处就是减小hash冲突的覆盖，如果4个桶都没有匹配，就找一个的最小的mask_cache_entry->skb_hash，更新这个mask_cache_entry。

3.没找到mask_cache_entry就遍历找mask_array表，并且更新

flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, &ce->mask_index);

if (flow)

    ce->skb_hash = skb_hash;

flow_lookup里： 

flow = masked_flow_lookup(ti, key, mask, n_mask_hit);

if (flow) { /* Found */

    *index = i;

    return flow;

}

首先是更新mask_index，传的就是地址，在flow_lookup里会更新。如果找到了flow，把skb_hash更新一下就行了。这整个过程就是相当于一级缓存。

查找 megaflow 缓存

查找 megaflow 缓存的入口函数是定义在 datapath/flow_table.c 文件中的 flow_lookup 函数：

static struct sw_flow *flow_lookup(struct flow_table *tbl,

                   struct table_instance *ti,

                   const struct mask_array *ma,

                   const struct sw_flow_key *key,

                   u32 *n_mask_hit,

                   u32 *index)

{

    struct sw_flow_mask *mask;

    struct sw_flow *flow;

    int i;

    if (*index < ma->max) {

        mask = rcu_dereference_ovsl(ma->masks[*index]);

        if (mask) {

            flow = masked_flow_lookup(ti, key, mask, n_mask_hit);

            if (flow)

                return flow;

        }

    }

    for (i = 0; i < ma->max; i++)  {

        if (i == *index)

            continue;

        mask = rcu_dereference_ovsl(ma->masks[i]);

        if (!mask)

            continue;

        flow = masked_flow_lookup(ti, key, mask, n_mask_hit);

        if (flow) { /* Found */

            *index = i;

            return flow;

        }

    }

    return NULL;

}

1.传进来的mask_array索引值index有效

// 根据传入的 index 获取到掩码数组的掩码，根据该掩码进行查找

if (*index < ma->max) {

    // 从掩码数组里获取掩码

    mask = rcu_dereference_ovsl(ma->masks[*index]);

    if (mask) {

        flow = masked_flow_lookup(ti, key, mask, n_mask_hit);

        if (flow)

            return flow;

    }

}

index在掩码数组的范围内，先通过rcu_dereference_ovsl获取mask，然后看能否找到flow，找到了就可以返回了。真正进行megaflow查询的是masked_flow_lookup函数，下边讲。

2.索引值index无效，就遍历每个mask_array

for (i = 0; i < ma->max; i++)  {

    if (i == *index) // 刚才已经查找过

        continue;

    mask = rcu_dereference_ovsl(ma->masks[i]); // 从掩码数组里获取掩码

    if (!mask)

        continue;

    flow = masked_flow_lookup(ti, key, mask, n_mask_hit);

    if (flow) { /* Found */

        *index = i;   // 找到了就更新mask_cache_entry

        return flow;

    }

}

真正查找megaflow的函数：masked_flow_lookup()

static struct sw_flow *masked_flow_lookup(struct table_instance *ti,

                      const struct sw_flow_key *unmasked,

                      const struct sw_flow_mask *mask,

                      u32 *n_mask_hit)

{

    struct sw_flow *flow;

    struct hlist_head *head;

    u32 hash;

    struct sw_flow_key masked_key;

    // 根据mask，计算masked后的key，用以支持通配符

    ovs_flow_mask_key(&masked_key, unmasked, false, mask);

    // 根据masked key和mask.range 计算hash值

    hash = flow_hash(&masked_key, &mask->range);

    // 根据hash值，找到sw_flow的链表头

    head = find_bucket(ti, hash);

    // mask命中次数+1

    (*n_mask_hit)++;

    // 遍历链表，解决hash冲突用的拉链法，所以是一条链

    hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver]) {

        // mask相同、hash相同并且key相同，则匹配到流表

        if (flow->mask == mask && flow->flow_table.hash == hash &&

            flow_cmp_masked_key(flow, &masked_key, &mask->range))

            return flow;

    }

    return NULL;

}

find_bucket 函数

static struct hlist_head *find_bucket(struct table_instance *ti, u32 hash)

{

        hash = jhash_1word(hash, ti->hash_seed);

        return &ti->buckets[hash & (ti->n_buckets - 1)]; // hash的低N位作为index

}

这样就找到了flow

缓存没有命中

直接看datapath.c

未命中就会执行upcall了

疑问

1.关于查找microflow里面的skb_hash

把一个hash分成了4份，如果都没有命中，更新skb_hash最小的mask_cache_entry，那这个利用率岂不是很低，如果4个桶，有3个不怎么用到的桶skb_hash很大，那相当于这个机制没啥用了。也可能这个算法就这个样。

2.关于mask命中次数+1，n_mask_hit没有初始化

最后找到直接：(*n_mask_hit)++;

OVS的TSS算法

算法原理

OVS 在内核态使用了元组空间搜索算法（Tuple Space Search，简称 TSS）进行流表查找，元组空间搜索算法的核心思想是，把所有规则按照每个字段的前缀长度进行组合，并划分为不同的元组中，然后在这些元组集合中进行哈希查找。我们举例说明，假设现有 10 条规则以及 3 个匹配字段，每个匹配字段长度均为 4：

我们将每条规则各匹配字段的前缀长度提取出来，按照前缀长度进行组合，并根据前缀长度组合进行分组：

我们将每个前缀长度组合称为元组，每个元组对应于哈希表的一个桶，同一前缀长度组合内的所有规则放置在同一个哈希桶内：

10 条规则被划分为 4 个元组，因此最多只需要四次查找，就可以找到对应的规则。

算法优缺点

为什么OVS选择TSS，而不选择其他查找算法？论文给出了以下三点解释：

（1）在虚拟化数据中心环境下，流的添加删除比较频繁，TSS支持高效的、常数时间的表项更新；（2）TSS支持任意匹配域的组合；（3）TSS存储空间随着流的数量线性增长，空间复杂度为 O(N)，N 为规则数目。

元组空间搜索算法的缺点是，由于基于哈希表实现，因此查找的时间复杂度不能确定。当所有规则各个字段的前缀长度组合数目过多时，查找性能会大大降低，最坏情况下需要查找所有规则。

参考：

深入理解 Open vSwitch（四）：内核态流表查找分析

【OVS2.5.0源码分析】datapath之流表查询

OVS内核流表查询过程的更多相关文章

ovs源码阅读--流表查询原理
背景在ovs交换机中,报文的处理流程可以划分为一下三个步骤:协议解析,表项查找和动作执行,其中最耗时的步骤在于表项查找,往往一个流表中有数目巨大的表项,如何根据数据报文的信息快速的查找到对应的流表项 ...
openVswitch（OVS）源代码分析之工作流程（flow流表查询）
原文链接: openVswitch(OVS)源代码分析之工作流程(flow流表查询)
Neutron 理解 (4): Neutron OVS OpenFlow 流表和 L2 Population [Netruon OVS OpenFlow tables + L2 Population]
学习 Neutron 系列文章: (1)Neutron 所实现的虚拟化网络 (2)Neutron OpenvSwitch + VLAN 虚拟网络 (3)Neutron OpenvSwitch + GR ...
openvswitch datapath 内核态流表创建过程（ovs_flow_cmd_new）
datapath流表更新的入口函数都定义在dp_flow_genl_ops中,流表创建的入口函数是ovs_flow_cmd_new函数,通过该函数,我们可以一窥流表相关信息的建立. 1.ovs_flo ...
ovs 删除流表指定 actions 中字段
例: ovs-ofctl del-flows br-int in_port=100,out_group=100 -O openflow13 ovs-ofctl del-flows br-int in_ ...
OVS 流表offload
原文链接:https://www.dazhuanlan.com/2019/12/31/5e0af1858dada/ 最近开始调研网卡的OVS流表offload功能,所以目前查看一下OVS这块是怎么做的 ...
openvswitch 流表操作
流表组成每条流表规则由一些列字段组成,可以分为**基础字段.匹配字段和动作字段**三部分. 在打印流表时,在流表中还存在一些显示字段,如duration,idle_age等,此处把这些字段也暂时归之 ...
OVS 内核KEY值提取及匹配流表代码分析
原文链接:http://ry0117.com/2016/12/24/OVS内核KEY值提取及匹配流表代码分析/ 当开启OVS后,创建datapath类型为system的网桥并他添加相关接口,OVS网桥 ...
Openvswitch原理与代码分析(5): 内核中的流表flow table操作
当一个数据包到达网卡的时候,首先要经过内核Openvswitch.ko,流表Flow Table在内核中有一份,通过key查找内核中的flow table,即可以得到action,然后执行acti ...
OVS中arp响应的流表的实现
总结: 1.br-int 流表总体是按照Normal 的方式,即常规的交换机的转发方式进行转发.而br-tun 交换机则主要按照流表的方式进行转发. 2.一般情况下,VM发出的ARP请求,会在该VM的 ...

随机推荐

js-模态框的拖动
效果如下: 代码如下: <div class="clickBtn">点击,弹出登录框</div> <div class="login&quo ...
【SSO单点系列】（1）：CAS4.0 之环境的搭建
[SSO单点系列](1):CAS4.0 环境的搭建一.概述今天开始写CAS相关的第一篇文章,这篇文章主要是关于CAS环境的搭配,提供给刚刚接触CAS的一个入门指南,并演示一个CAS的最简单的实例 ...
删除没有刀路的刀具.txt
1 UF_initialize(); 2 std::vector<tag_t>tool_tag; 3 std::vector<tag_t>del_tag; 4 tag_t ...
element的el-table合计显示不出来
在updated中写入 this.$nextTick(() => { this.$refs["printStatisticsTableData"].doLayout(); } ...
Android 系统完整的权限列表
访问登记属性 android.permission.ACCESS_CHECKIN_PROPERTIES ,读取或写入登记check-in数据库属性表的权限获取错略位置 android.perm ...
Study python_04
数组 a = [1,2,3] print(a) 数组替换 a = [1,2,3] a[0] = 100 print(a) 数组去重复 def delete_chong(): a = [1,1,2,2, ...
linux命令关机和重启命令及文件查找管道符用法linux grep
关机和重启命令 Shutdown Reboot Halt poweroff grep和管道符昨天的时候 leader 给我出了道问题: 找出文件夹下包含 "aaa" 同时不包含 ...
Leetcode——二分法bisect_left,bisect_right
!前提--列表有序 case 1 如果列表中没有元素x,那么bisect_left(ls, x)和bisec_right(ls, x)返回相同的值,该值是x在ls中"合适的插入点索引,使得数 ...
【APT】Patchwork APT组织针对巴基斯坦国防官员攻击活动分析
前言 Patchwork(白象.摩诃草.APT-C-09.Dropping Elephant)是一个疑似具有印度国家背景的APT组织,该组织长期针对中国.巴基斯坦等南亚地区国家进行网络攻击窃密活动.本 ...
【LeetCode】 907 子数组的最小值之和
Decrisption Given an array of integers arr, find the sum of min(b), where b ranges over every (conti ...

OVS内核流表查询过程

概括

正文