一图胜千言：

flow和miniflow

在介绍之前先说一些概念：里面有两个结构很重要，一个是flow一个是miniflow这里介绍一下他们的数据结构和构造函数。

flow：

flow的特点是8字节对齐的，存储报文相关字段和其他原数据，用于匹配流表，数据包含四个层次：

metadata: 入端口号，寄存器等信息
l2: 源目的mac，vlan和mpls等信息
l3: ipv4/ipv6源目的ip，ttl等信息
l4: 源目的端口号，icmp code和type等信息。

flow的坏处就是占用了很大的字节，并且有很多字段都是0，在2.8版本中flow的大小是672字节。

miniflow

miniflow是flow的压缩版，因为flow占用字节很大，比如可以支持ARP，IP等报文，填充了arp字段，icmp报文就是空的了，浪费了很多信息。过程中用到hash作为key，也是根据miniflow计算hash值，不是用的flow。

struct miniflow {

    struct flowmap map;

};

struct flowmap {

    map_t bits[FLOWMAP_UNITS];

};

miniflow其包含两部分内容：

struct flowmap map;是bit数组，使用其中的bit表示flow中哪个8字节存在有效数据，flow中占多少个8字节，那么就需要map中多个个bit，并且按照64bit向上取整。
第二部分是有效数据，有效数据动态分配，根据struct flowmap map;中1bit数个数进行分配，大小为bit数*8字节，该部分直接跟在map后面。该部分存储在netdev_flow_key结构中的buf数组。

miniflow数据结构：

//flow是8字节对齐的，除8得到flow中包含8字节的个数

#define FLOW_U64S (sizeof(struct flow) / sizeof(uint64_t))

//map大小为8字节，MAP_T_BITS 为64位

typedef unsigned long long map_t;

#define MAP_T_BITS (sizeof(map_t) * CHAR_BIT)

//每位表示一个u64，FLOWMAP_UNITS 表示最少需要几个64位

#define FLOWMAP_UNITS DIV_ROUND_UP(FLOW_U64S, MAP_T_BITS)

struct flowmap {

    map_t bits[FLOWMAP_UNITS];

};

struct miniflow {

    struct flowmap map;

    /* Followed by:

     *     uint64_t values[n];

     * where 'n' is miniflow_n_values(miniflow). */

};

struct netdev_flow_key {

    uint32_t hash;

    uint32_t len;

    struct miniflow mf;  // bits

    uint64_t buf[FLOW_MAX_PACKET_U64S];  // 就是上边所说的value

};

// 有些字段是互斥的

#define FLOW_MAX_PACKET_U64S (FLOW_U64S                                   \

    /* Unused in datapath */  - FLOW_U64_SIZE(regs)                       \

                              - FLOW_U64_SIZE(metadata)                   \

    /* L2.5/3 */              - FLOW_U64_SIZE(nw_src)  /* incl. nw_dst */ \

                              - FLOW_U64_SIZE(mpls_lse)                   \

    /* L4 */                  - FLOW_U64_SIZE(tp_src)                     \

                             )

miniflow优点：

使用miniflow可以节省内存
如果只想遍历flow中的非0字段时，使用miniflow找到对应的非0字段，可以节省时间

flow->miniflow函数：miniflow_extract()

void

miniflow_extract(struct dp_packet *packet, struct miniflow *dst)

{

    ...

    // 初始化赋值有两个关键，一个是这个values： return (uint64_t *)(mf + 1);

    // 就是上边说的

    uint64_t *values = miniflow_values(dst);

    struct mf_ctx mf = { FLOWMAP_EMPTY_INITIALIZER, values,

                         values + FLOW_U64S };

    ...

    if (md->skb_priority || md->pkt_mark) {

        miniflow_push_uint32(mf, skb_priority, md->skb_priority);

        miniflow_push_uint32(mf, pkt_mark, md->pkt_mark);

    }

    miniflow_push_be16(mf, dl_type, dl_type);

    miniflow_pad_to_64(mf, dl_type);

    ...

    // 去取网络层信息,从这里可以看出，ovs暂时只支持IP,IPV6,ARP,RARP报文

    if (OVS_LIKELY(dl_type == htons(ETH_TYPE_IP))){...}

    else if

    ...

    // 提取传输层，从这里可以看出，ovs暂时支持传输层协议有TCP,UDP,SCTP,ICMP,ICMPV6

    if (OVS_LIKELY(nw_proto == IPPROTO_TCP)){...}

    else if

    ...

miniflow_push_uint32()

在上面将value保存到miniflow时，用到了几个辅助函数，比如下面的miniflow_push_uint32用来将一个32位的值保存到miniflow中FIELD对应的位置。其首先调用offsetof获取field在flow中的偏移字节数，因为flow是8字节对齐的，所以一个四字节的成员变量要么位于8字节的起始位置，要么位于8字节的中间位置，即对8取模值肯定为0或者4，再调用miniflow_push_uint32_保存到对应的位置，并设置map中对应的bit为1。

#define miniflow_push_uint32(MF, FIELD, VALUE)                      \

    miniflow_push_uint32_(MF, offsetof(struct flow, FIELD), VALUE)

#define miniflow_push_uint32_(MF, OFS, VALUE)   \

    {                                           \

    MINIFLOW_ASSERT(MF.data < MF.end);          \

                                                \

    //成员变量位于起始位置，需要调用miniflow_set_map设置对应的bit为1

    if ((OFS) % 8 == 0) {                       \

        miniflow_set_map(MF, OFS / 8);          \

        *(uint32_t *)MF.data = VALUE;           \

    } else if ((OFS) % 8 == 4) {                \

    //成员变量不在起始位置，要判断此变量所在的bit为1

        miniflow_assert_in_map(MF, OFS / 8);    \

        *((uint32_t *)MF.data + 1) = VALUE;     \

        MF.data++;                              \

    }                                           \

}

miniflow->flow函数：miniflow_expand()

/* Initializes 'dst' as a copy of 'src'. */

void

miniflow_expand(const struct miniflow *src, struct flow *dst)

{

    memset(dst, 0, sizeof *dst);

    flow_union_with_miniflow(dst, src);

}

/* Perform a bitwise OR of miniflow 'src' flow data with the equivalent

 * fields in 'dst', storing the result in 'dst'. */

static inline void

flow_union_with_miniflow(struct flow *dst, const struct miniflow *src)

{

    flow_union_with_miniflow_subset(dst, src, src->map);

}

static inline void

flow_union_with_miniflow_subset(struct flow *dst, const struct miniflow *src,

                                struct flowmap subset)

{

    uint64_t *dst_u64 = (uint64_t *) dst;

    const uint64_t *p = miniflow_get_values(src);

    map_t map;

    //遍历所有的map

    FLOWMAP_FOR_EACH_MAP (map, subset) {

        size_t idx;

        //遍历map中所有的非0bit

        MAP_FOR_EACH_INDEX(idx, map) {

            dst_u64[idx] |= *p++;

        }

        dst_u64 += MAP_T_BITS;

    }

}

流表查询过程

概要

该部分入口在lib/dpif-netdev.c，就是最开始的那个图。

查询的缓存分为两层：一个是DFC，一个是dpcls，相当于microflow和megaflow，DFC由两部分组成，DFC（datapath flow cache）：EMC（Exact match cache）+SMC（Signature match cache），另一部分就是dpcls(datapath classifer)。

SMC默认关闭：bool smc_enable = smap_get_bool(other_config, "smc-enable", false);

函数执行流程(不包含SMC的)：

入口在dp_netdev_input__()

static void

dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,

                  struct dp_packet_batch *packets,

                  bool md_is_valid, odp_port_t port_no)

{

#if !defined(__CHECKER__) && !defined(_WIN32)

    const size_t PKT_ARRAY_SIZE = dp_packet_batch_size(packets);

#else

    /* Sparse or MSVC doesn't like variable length array. */

    enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };

#endif

    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)

        struct netdev_flow_key keys[PKT_ARRAY_SIZE];

    struct netdev_flow_key *missed_keys[PKT_ARRAY_SIZE];

    struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];

    size_t n_batches;

    struct dp_packet_flow_map flow_map[PKT_ARRAY_SIZE];

    uint8_t index_map[PKT_ARRAY_SIZE];

    size_t n_flows, i;

    odp_port_t in_port;

    n_batches = 0;

    // 1. dfc_processing之后会把miss的放到packets里

    //    找到的可能已经batched了，或者放到flow_map里了

    //    flow_map里是未bathed的，可能直接是*flow或者是NULL，是NULL再去下一层cache查

    dfc_processing(pmd, packets, keys, missed_keys, batches, &n_batches,

                   flow_map, &n_flows, index_map, md_is_valid, port_no);

    // 2. 如果有miss的，再去找fast-path，也就是查dpcls

    if (!dp_packet_batch_is_empty(packets)) {

        in_port = packets->packets[0]->md.in_port.odp_port;

        fast_path_processing(pmd, packets, missed_keys,

                             flow_map, index_map, in_port);

    }

    /* Batch rest of packets which are in flow map. */

    for (i = 0; i < n_flows; i++) {

        struct dp_packet_flow_map *map = &flow_map[i];

        if (OVS_UNLIKELY(!map->flow)) {

            continue;

        }

        dp_netdev_queue_batches(map->packet, map->flow, map->tcp_flags,

                                batches, &n_batches);

    }

    for (i = 0; i < n_batches; i++) {

        batches[i].flow->batch = NULL;

    }

    // 执行每个packet的action

    for (i = 0; i < n_batches; i++) {

        packet_batch_per_flow_execute(&batches[i], pmd);

    }

}

1. DFC查询：dfc_processing()

static inline size_t

dfc_processing(struct dp_netdev_pmd_thread *pmd,

               struct dp_packet_batch *packets_,

               struct netdev_flow_key *keys,

               struct netdev_flow_key **missed_keys,

               struct packet_batch_per_flow batches[], size_t *n_batches,

               struct dp_packet_flow_map *flow_map,

               size_t *n_flows, uint8_t *index_map,

               bool md_is_valid, odp_port_t port_no)

{

    struct netdev_flow_key *key = &keys[0];

    size_t n_missed = 0, n_emc_hit = 0;

    struct dfc_cache *cache = &pmd->flow_cache;

    struct dp_packet *packet;

    size_t cnt = dp_packet_batch_size(packets_);

    // emc的插入概率，如果为0，表示不开启emc

    uint32_t cur_min = pmd->ctx.emc_insert_min;

    int i;

    uint16_t tcp_flags;

    bool smc_enable_db;

    // 记录未batched的个数

    size_t map_cnt = 0;

    // 这个变量用于保序

    bool batch_enable = true;

    // 获取smc是否开启参数

    atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db);

    pmd_perf_update_counter(&pmd->perf_stats,

                            md_is_valid ? PMD_STAT_RECIRC : PMD_STAT_RECV,

                            cnt);

    do_dfc_hook(pmd, packets_, batches, n_batches);

    cnt = dp_packet_batch_size(packets_);     

    // 逐个对dp_packet_batch中的每一个packet进行处理

    DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {

        struct dp_netdev_flow *flow;

        // 若packet包长小于以太头的长度直接丢包

        if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {

            dp_packet_delete(packet);

            COVERAGE_INC(datapath_drop_rx_invalid_packet);

            continue;

        }

        // 对数据手工预取可减少读取延迟，从而提高性能

        if (i != cnt - 1) {

            struct dp_packet **packets = packets_->packets;

            /* Prefetch next packet data and metadata. */

            OVS_PREFETCH(dp_packet_data(packets[i+1]));

            pkt_metadata_prefetch_init(&packets[i+1]->md);

        }

        // 初始化metadata首先将pkt_metadata中flow_in_port前的字节全部设为0

        // 将in_port.odp_port设为port_no， tunnel.ipv6_dst设为in6addr_any

        if (!md_is_valid) {

            pkt_metadata_init(&packet->md, port_no);

        }

        // 报文转化为miniflow, 上文有讲

        miniflow_extract(packet, &key->mf);

        key->len = 0; /* Not computed yet. */

        // 计算当前报文miniflow的hash值

        key->hash =

                (md_is_valid == false)

                ? dpif_netdev_packet_get_rss_hash_orig_pkt(packet, &key->mf)

                : dpif_netdev_packet_get_rss_hash(packet, &key->mf);

        // 根据key->hash,emc_entry alive,miniflow 3个条件得到dp_netdev_flow

        // cur_min = 0,表示不可能插入，后面有讲什么时候才会插入EMC

        flow = (cur_min != 0) ? emc_lookup(&cache->emc_cache, key) : NULL;

        if (OVS_LIKELY(flow)) {

            tcp_flags = miniflow_get_tcp_flags(&key->mf);

            n_emc_hit++; // 命中次数+1

            // 为了保证报文的顺序，所有的packet对应的flow都用flow_map存储

            // flow_map里面就是packet数量对应的(packet,flow,tcp_flag)

            // 最后会把这些在dp_netdev_input__里重新把顺序合并一下

            if (OVS_LIKELY(batch_enable)) {

                // 把查到的flow加到batches里第n_batches个batch里

                dp_netdev_queue_batches(packet, flow, tcp_flags, batches,

                                        n_batches);

            } else {

                packet_enqueue_to_flow_map(packet, flow, tcp_flags,

                                           flow_map, map_cnt++);

            }

        } else {

            // 这些数据结构用于smc查询时的记录

            // 没查到把packet放到packets_里，从下标0再开始放

            // 最后packets_都是未查到的

            dp_packet_batch_refill(packets_, packet, i);

            index_map[n_missed] = map_cnt;

            flow_map[map_cnt++].flow = NULL;

            missed_keys[n_missed] = key;

            key = &keys[++n_missed];

            batch_enable = false; // 之后的都是未batched的

        }

    }

    *n_flows = map_cnt;

    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, n_emc_hit);

    // 如果没有开启smc，直接返回了

    if (!smc_enable_db) {

        return dp_packet_batch_size(packets_);

    }

    smc_lookup_batch(pmd, keys, missed_keys, packets_,

                     n_missed, flow_map, index_map);

    return dp_packet_batch_size(packets_);

}

1.1 emc查询：emc_lookup()

static inline struct dp_netdev_flow *

emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)

{

    struct emc_entry *current_entry;

    // 这里说一下，一个hash分配两个桶，长度为13位，cache桶的大小为1<<13

    // struct emc_cache {

    //    struct emc_entry entries[EM_FLOW_HASH_ENTRIES];

    //    int sweep_idx;                /* For emc_cache_slow_sweep(). */

    // };

    EMC_FOR_EACH_POS_WITH_HASH (cache, current_entry, key->hash) {

        if (current_entry->key.hash == key->hash

            && emc_entry_alive(current_entry)

            && emc_flow_key_equal_mf(&current_entry->key, &key->mf)) {

            /* We found the entry with the 'key->mf' miniflow */

            return current_entry->flow;

        }

    }

    return NULL;

}

#define EM_FLOW_HASH_SHIFT 13

#define EM_FLOW_HASH_ENTRIES (1u << EM_FLOW_HASH_SHIFT)

#define EM_FLOW_HASH_MASK (EM_FLOW_HASH_ENTRIES - 1)

#define EM_FLOW_HASH_SEGS 2

#define EMC_FOR_EACH_POS_WITH_HASH(EMC, CURRENT_ENTRY, HASH)                 \

    for (uint32_t i__ = 0, srch_hash__ = (HASH);                             \

         (CURRENT_ENTRY) = &(EMC)->entries[srch_hash__ & EM_FLOW_HASH_MASK], \

         i__ < EM_FLOW_HASH_SEGS;                                            \

         i__++, srch_hash__ >>= EM_FLOW_HASH_SHIFT)

// 比较miniflow是否相同

static inline bool

emc_flow_key_equal_mf(const struct netdev_flow_key *key,

                         const struct miniflow *mf)

{

    return !memcmp(&key->mf, mf, key->len);

}

EMC查询函数执行：

1.2 smc查询：smc_lookup_batch()

static inline void

smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,

            struct netdev_flow_key *keys,

            struct netdev_flow_key **missed_keys,

            struct dp_packet_batch *packets_,

            const int cnt,

            struct dp_packet_flow_map *flow_map,

            uint8_t *index_map)

{

    int i;

    struct dp_packet *packet;

    size_t n_smc_hit = 0, n_missed = 0;

    struct dfc_cache *cache = &pmd->flow_cache;

    struct smc_cache *smc_cache = &cache->smc_cache;

    const struct cmap_node *flow_node;

    int recv_idx;

    uint16_t tcp_flags;

    /* Prefetch buckets for all packets */

    for (i = 0; i < cnt; i++) {

        OVS_PREFETCH(&smc_cache->buckets[keys[i].hash & SMC_MASK]);

    }

    DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {

        struct dp_netdev_flow *flow = NULL;

        // 找到hash相同的flow链表的头节点

        flow_node = smc_entry_get(pmd, keys[i].hash);

        bool hit = false;

        /* Get the original order of this packet in received batch. */

        recv_idx = index_map[i];

        if (OVS_LIKELY(flow_node != NULL)) {

            // 遍历一下看看哪一个是相同的，这个通过offsetof找到存放该cmap结构体的首地址

            // dp_netdev_flow里面的首地址就是，

            CMAP_NODE_FOR_EACH (flow, node, flow_node) {

                /* Since we dont have per-port megaflow to check the port

                 * number, we need to  verify that the input ports match. */

                if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, &keys[i]) &&

                flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {

                    tcp_flags = miniflow_get_tcp_flags(&keys[i].mf);

                    keys[i].len =

                        netdev_flow_key_size(miniflow_n_values(&keys[i].mf));

                    if (emc_probabilistic_insert(pmd, &keys[i], flow)) {

                        if (flow->status == OFFLOAD_NONE) {

                            queue_netdev_flow_put(pmd->dp->dp_flow_offload, \

                                    pmd->dp->class, \

                                    flow, NULL, DP_NETDEV_FLOW_OFFLOAD_OP_ADD);

                        }

                    }

                    packet_enqueue_to_flow_map(packet, flow, tcp_flags,

                                               flow_map, recv_idx);

                    n_smc_hit++;

                    hit = true;

                    break;

                }

            }

            if (hit) {

                continue;

            }

        }

        // SMC也miss了，和之前一样，把miss的放packets_里，从0开始放

        dp_packet_batch_refill(packets_, packet, i);

        index_map[n_missed] = recv_idx;

        missed_keys[n_missed++] = &keys[i];

    }

    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, n_smc_hit);

}

查找hash相同的链表头：smc_entry_get()

static inline const struct cmap_node *

smc_entry_get(struct dp_netdev_pmd_thread *pmd, const uint32_t hash)

{

    struct smc_cache *cache = &(pmd->flow_cache).smc_cache;

    // smc_cache桶的大小是（1<<18），SMC_MASK=（1<<18）- 1

    // 先通过后hash的后18位定位到桶

    struct smc_bucket *bucket = &cache->buckets[hash & SMC_MASK];

    // 一个桶有4个16位的sig，存key->hash前16位，正好是64位

    // 遍历4个元素看那个匹配，获得匹配后的cmap的下标

    uint16_t sig = hash >> 16;

    uint16_t index = UINT16_MAX;

    for (int i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {

        if (bucket->sig[i] == sig) {

            index = bucket->flow_idx[i];

            break;

        }

    }

    // 通过index找到在dpcls里的桶位置

    if (index != UINT16_MAX) {

        return cmap_find_by_index(&pmd->flow_table, index);

    }

    return NULL;

}

1.3 更新emc：emc_probabilistic_insert()

命中SMC后，插入回上一层cache（EMC）里：emc_probabilistic_insert()

插入EMC的条件:

默认插入流表的概率是1%，可以通过ovs-vsctl set Open_vSwitch . other_config:emc-insert-prob=10 设置概率，表示平均10条流表有1条插入，当为0时禁用EMC，当为1的时候，百分百插入。设置后会在代码里设置emc_insert_min字段为uint_max/10，插入的时候生成一个uint_random()，如果随机数小于emc_insert_min才会插入。

static inline bool

emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd,

                         const struct netdev_flow_key *key,

                         struct dp_netdev_flow *flow)

{

    /* Insert an entry into the EMC based on probability value 'min'. By

     * default the value is UINT32_MAX / 100 which yields an insertion

     * probability of 1/100 ie. 1% */

    uint32_t min = pmd->ctx.emc_insert_min;

    if (min && random_uint32() <= min) {

        emc_insert(&(pmd->flow_cache).emc_cache, key, flow);

        return true;

    }

    return false;

}

emc_insert同样有我在内核查询里的问题，如果cache里没有该miniflow，会找一个hash值小的entry，覆盖这个entry，那如果有一个hash很大的flow被插入了，但是这个flow之后就没用过了，那岂不是这个entry就浪费了，不会被用到。

找到了合适的emc_entry。则将报文对应的netdev_dev_flow key信息存储到该表项中。而对于这个表项，原有的emc_entry.flow有可能还有指向一条旧的流表，需要将这条流表的引用计数减1，如果减1后达到0，则释放该流表空间。同时更新emc_entry.flow重新指向新的流表。到此为止，EMC表项更新完毕。

static inline void

emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,

           struct dp_netdev_flow *flow)

{

    struct emc_entry *to_be_replaced = NULL;

    struct emc_entry *current_entry;

    EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {

        if (netdev_flow_key_equal(&current_entry->key, key)) {

            /* We found the entry with the 'mf' miniflow */

            emc_change_entry(current_entry, flow, NULL);

            return;

        }

        /* Replacement policy: put the flow in an empty (not alive) entry, or

         * in the first entry where it can be */

        if (!to_be_replaced

            || (emc_entry_alive(to_be_replaced)

                && !emc_entry_alive(current_entry))

            || current_entry->key.hash < to_be_replaced->key.hash) {

            // 这个黄色判断就是我迷惑的地方

            to_be_replaced = current_entry;

        }

    }

    /* We didn't find the miniflow in the cache.

     * The 'to_be_replaced' entry is where the new flow will be stored */

    emc_change_entry(to_be_replaced, flow, key);

}

1.4 EMC的轮训更新

在pmd_thread_main()里面：

if (lc++ > 1024) {

    lc = 0;

    coverage_try_clear();

    // 这里的optimize是排序一下TSS

    dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt);

    dp_netdev_pmd_hook_idle_run(pmd);

#ifdef ENABLE_EMC

    if (!ovsrcu_try_quiesce()) {

        emc_cache_slow_sweep(pmd->dp, &((pmd->flow_cache).emc_cache));

    }

#else

    ovsrcu_try_quiesce();

#endif

    for (i = 0; i < poll_cnt; i++) {

        uint64_t current_seq =

                 netdev_get_change_seq(poll_list[i].rxq->port->netdev);

        if (poll_list[i].change_seq != current_seq) {

            poll_list[i].change_seq = current_seq;

            poll_list[i].rxq_enabled =

                         netdev_rxq_enabled(poll_list[i].rxq->rx);

        }

    }

}

1.5 承上启下：OVS的TSS算法

dpcls是megaflow的查询过程，使用TSS算法，是个很老的算法了，看源码之前，先讲一下ovs里面的TSS，之前内核已经讲过，但是没有讲OVS里做的优化，下边再说一次，然后建议再去看一下这个有很多图的博客OVS-DPDK Datapath Classifier，这样之后对整个dpcls流程就有所了解了。

TSS算法原理

OVS 在内核态使用了元组空间搜索算法（Tuple Space Search，简称 TSS）进行流表查找，元组空间搜索算法的核心思想是，把所有规则按照每个字段的前缀长度进行组合，并划分为不同的元组中，然后在这些元组集合中进行哈希查找。我们举例说明，假设现有 10 条规则以及 3 个匹配字段，每个匹配字段长度均为 4：

我们将每条规则各匹配字段的前缀长度提取出来，按照前缀长度进行组合，并根据前缀长度组合进行分组：

我们将每个前缀长度组合称为元组，每个元组对应于哈希表的一个桶，同一前缀长度组合内的所有规则放置在同一个哈希桶内：

10 条规则被划分为 4 个元组，因此最多只需要四次查找，就可以找到对应的规则。

算法优缺点

为什么OVS选择TSS，而不选择其他查找算法？论文给出了以下三点解释：

（1）在虚拟化数据中心环境下，流的添加删除比较频繁，TSS支持高效的、常数时间的表项更新；（2）TSS支持任意匹配域的组合；（3）TSS存储空间随着流的数量线性增长，空间复杂度为 O(N)，N 为规则数目。

元组空间搜索算法的缺点是，由于基于哈希表实现，因此查找的时间复杂度不能确定。当所有规则各个字段的前缀长度组合数目过多时，查找性能会大大降低，最坏情况下需要查找所有规则。

OVS里做的排序优化

查找的过程需要从前向后遍历所有元组，命中了就不用往后查了。OVS给每个元组加了一个命中次数，命中次数越多，元组这个链表越靠前，这样就可以减少了查表次数。

2. dpcls查询

2.1 dpcls相关数据结构

// 线程安全的

#define OVSRCU_TYPE(TYPE) struct { ATOMIC(TYPE) p; }

struct cmap {

    OVSRCU_TYPE(struct cmap_impl *) impl;

};

/* The implementation of a concurrent hash map. */

struct cmap_impl {

    // 补齐64字节

    PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline0,

        unsigned int n;             /* Number of in-use elements. */

        unsigned int max_n;         /* Max elements before enlarging. */

        unsigned int min_n;         /* Min elements before shrinking. */

        uint32_t mask;              /* Number of 'buckets', minus one. */

        uint32_t basis;             /* Basis for rehashing client's

                                       hash values. */

    );

    PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline1,

        struct cmap_bucket buckets[1];

    );

};

struct cmap_bucket {

    /* Padding to make cmap_bucket exactly one cache line long. */

    PADDED_MEMBERS(CACHE_LINE_SIZE,

        // 锁机制，读和写都会+1，读的时候等到变成偶数再去读，保证安全

        atomic_uint32_t counter;

        // 桶中的每个槽用(hashs[i], nodes[i])元组来表示

        uint32_t hashes[CMAP_K];

        struct cmap_node nodes[CMAP_K];

    );

};

struct cmap_node {

    OVSRCU_TYPE(struct cmap_node *) next; /* Next node with same hash. */

};

/* 二级匹配表.每个报文接收端口对应一个 */

struct dpcls {

    struct cmap_node node; /* 链表节点 */

    odp_port_t in_port;    /* 报文接收端口 */

    struct cmap subtables_map; // 管理下边subtables的索引，用于遍历

    struct pvector subtables;  // 上文TSS算法所说的元组表

}

 

struct pvector {

    // 指向具体子表信息

    OVSRCU_TYPE(struct pvector_impl *) impl;

    // 平时,temp都是为NULL.只有当pvector扩充时,temp才用来临时缓存数据.

    // 待排好序后,再拷贝到impl中,temp再置NULL

    struct pvector_impl *temp;

};

// 相当于vector<pvector_entry>

struct pvector_impl {

    size_t size; /* Number of entries in the vector */

    size_t allocated; /* Number allocted entries */

    /* 初始化的时候只有4个元素.后续可能会扩充 */

    struct pvector_entry vector[];

}

 

struct pvector_entry {

        // pvector_impl中的vector是按照priority从小到大排序的

        // pmd_thread_main里会把priority赋值为hit_cnt，然后排序

        int priority;

        /* 实际指向了struct dpcls_subtable结构 */

        void *ptr;

}

 

// 子表信息

struct dpcls_subtable {

    /* The fields are only used by writers. */

    struct cmap_node cmap_node OVS_GUARDED; /* Within dpcls 'subtables_map'. */

    struct cmap rules; // 该表的bucket内容

    uint32_t hit_cnt;  // 命中该子表的次数

    // 下边是mask的miniflow前两个的bits里1的个数

    uint8_t mf_bits_set_unit0;

    uint8_t mf_bits_set_unit1;

    // 根据mf_bits_set_unit01选择最合适的查找算法

    dpcls_subtable_lookup_func lookup_func;

    /* Caches the masks to match a packet to, reducing runtime calculations. */

    uint64_t *mf_masks; // 由下边的mask->mf->bits[01]得来的，

    struct netdev_flow_key mask; // 该表的掩码信息

};

关于上边的mf_masks与mask，举个例子

mf_bits_set_unit0 = 4， mf_bits_set_unit1 = 0

netdev_flow_key.mf.bits[0] = 111010 (2进制)

mf_masks = [1, 111, 1111, 11111]  （2进制）

三个图对应他们的关系，链表三用于遍历的，查找过程中并不会通过链表三方式搜索。查找的时候走的就是链表二的流程。

2.2 dpcls查询入口：fast_path_processing->dpcls_lookup()

static bool

dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],

             struct dpcls_rule **rules, const size_t cnt,

             int *num_lookups_p)

{

#define MAP_BITS (sizeof(uint32_t) * CHAR_BIT)

    BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST);

    struct dpcls_subtable *subtable;

    uint32_t keys_map = TYPE_MAXIMUM(uint32_t); /* Set all bits. */

    if (cnt != MAP_BITS) {

        /*keys_map中置1位数为包的总数，并且第i位对应第i个包*/

        keys_map >>= MAP_BITS - cnt; /* Clear extra bits. */

    }

    memset(rules, 0, cnt * sizeof *rules);

    int lookups_match = 0, subtable_pos = 1;

    uint32_t found_map;

    PVECTOR_FOR_EACH (subtable, &cls->subtables) {

        // 查找函数，对应下边的lookup_generic()

        found_map = subtable->lookup_func(subtable, keys_map, keys, rules);

        uint32_t pkts_matched = count_1bits(found_map);

        // 搜索的子表个数，加上的是当前这几个key找了多少个表

        lookups_match += pkts_matched * subtable_pos;

        keys_map &= ~found_map;

        if (!keys_map) {

            if (num_lookups_p) {

                *num_lookups_p = lookups_match;

            }

            // 全找到了

            return true;

        }

        subtable_pos++;

    }

    if (num_lookups_p) {

        *num_lookups_p = lookups_match;

    }

    // 没有全找到

    return false;

}

lookup_generic()

ovs-dpdk里面有avx512-gather.c，使用avx512优化了look_up，整体逻辑还是一样的，这里只说dpif-netdev-lookup-generic

入口在这里，往下走,传进去subtable有效字段有多大

static uint32_t

dpcls_subtable_lookup_generic(struct dpcls_subtable *subtable,

                              uint32_t keys_map,

                              const struct netdev_flow_key *keys[],

                              struct dpcls_rule **rules)

{

    return lookup_generic_impl(subtable, keys_map, keys, rules,

                               subtable->mf_bits_set_unit0,

                               subtable->mf_bits_set_unit1);

}

static inline uint32_t ALWAYS_INLINE

lookup_generic_impl(struct dpcls_subtable *subtable,      // 当前的subtable

                    uint32_t keys_map,                    // miss_bit_map

                    const struct netdev_flow_key *keys[], // miss_key

                    struct dpcls_rule **rules,            // save hit_rule

                    const uint32_t bit_count_u0,

                    const uint32_t bit_count_u1)

{

    // 有几个包

    const uint32_t n_pkts = count_1bits(keys_map);

    ovs_assert(NETDEV_MAX_BURST >= n_pkts);

    uint32_t hashes[NETDEV_MAX_BURST];

    // 根据mask字段的大小开空间

    const uint32_t bit_count_total = bit_count_u0 + bit_count_u1;

    // 一个batch最大是NETDEV_MAX_BURST

    const uint32_t block_count_required = bit_count_total * NETDEV_MAX_BURST;

    uint64_t *mf_masks = subtable->mf_masks;

    int i;

    // 申请存储一个batch报文信息的数组，存放

    uint64_t *blocks_scratch = get_blocks_scratch(block_count_required); 

    // 获得每个key与当前表的mask“与运算”的结果

    ULLONG_FOR_EACH_1 (i, keys_map) {

            netdev_flow_key_flatten(keys[i],

                                    &subtable->mask, // 该表的掩码信息

                                    mf_masks,  // 由subtable->mask处理后的mask

                                    &blocks_scratch[i * bit_count_total],

                                    bit_count_u0,

                                    bit_count_u1);

    }

    // 算出来每一个key在该subtable里的hash值，该hash值由“mask字节数，key和mask与运算结果”得出

    ULLONG_FOR_EACH_1 (i, keys_map) {

        uint64_t *block_ptr = &blocks_scratch[i * bit_count_total];

        uint32_t hash = hash_add_words64(0, block_ptr, bit_count_total);

        hashes[i] = hash_finish(hash, bit_count_total * 8);

    }

    uint32_t found_map;

    const struct cmap_node *nodes[NETDEV_MAX_BURST];

    // 找到每个key在该subtable里的cmap，并且返回每个key有没有被找到，第i位是1则找到

    found_map = cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);

    ULLONG_FOR_EACH_1 (i, found_map) {

        struct dpcls_rule *rule;

        // 可能不同的rule有相同的hash，看那个是匹配的

        CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {

            const uint32_t cidx = i * bit_count_total;

            /*rule->mask & keys[i]的值与rule->flow相比较*/

            uint32_t match = netdev_rule_matches_key(rule, bit_count_total,

                                                     &blocks_scratch[cidx]);

            if (OVS_LIKELY(match)) {

                rules[i] = rule;

                subtable->hit_cnt++;

                goto next;

            }

        }

        ULLONG_SET0(found_map, i);  /* Did not match. */

    next:

        ; /* Keep Sparse happy. */

    }

    return found_map;

}

掩码运算netdev_flow_key_flatten()

// 这个函数对应dpif-netdev.c里面的dpcls_flow_key_gen_masks()

static inline void

netdev_flow_key_flatten(const struct netdev_flow_key *key,  // 要查找的miss_key

                        const struct netdev_flow_key *mask,

                        const uint64_t *mf_masks,

                        uint64_t *blocks_scratch,

                        const uint32_t u0_count,

                        const uint32_t u1_count)

{

    /* Load mask from subtable, mask with packet mf, popcount to get idx. */

    const uint64_t *pkt_blocks = miniflow_get_values(&key->mf);

    const uint64_t *tbl_blocks = miniflow_get_values(&mask->mf); // 获取miss_key和mask的miniflow

    /* Packet miniflow bits to be masked by pre-calculated mf_masks. */

    const uint64_t pkt_bits_u0 = key->mf.map.bits[0];

    const uint32_t pkt_bits_u0_pop = count_1bits(pkt_bits_u0);

    const uint64_t pkt_bits_u1 = key->mf.map.bits[1];

// 这个函数就是把miss_key与subtable的掩码进行&运算

// 会运算出该mask在意字段结果，放到blocks_scratch里

    netdev_flow_key_flatten_unit(&pkt_blocks[0],   // key-mf的数据段

                                 &tbl_blocks[0],   // mask->mf的数据段

                                 &mf_masks[0],     // mask->mf->bits得来mask

                                 &blocks_scratch[0], // 存放的地址

                                 pkt_bits_u0,      // key->mf里的bits[0]

                                 u0_count);        // mask->mf->bits[0]里1的个数

    netdev_flow_key_flatten_unit(&pkt_blocks[pkt_bits_u0_pop], // 上边bits[0]的已经算过了，从bits[1]开始算

                                 &tbl_blocks[u0_count],

                                 &mf_masks[u0_count],

                                 &blocks_scratch[u0_count],

                                 pkt_bits_u1,

                                 u1_count);

}

static inline void

netdev_flow_key_flatten_unit(const uint64_t *pkt_blocks, // key-mf的数据段

                             const uint64_t *tbl_blocks, // mask->mf里的数据段

                             const uint64_t *mf_masks,   // mask->mf->bits得来mask

                             uint64_t *blocks_scratch,   // 存放到这里

                             const uint64_t pkt_mf_bits, // key->mf里的bits[01]

                             const uint32_t count)       // mask->mf->bits[0]里1的个数

{

// 说一下意思，这个我们流程就是用key和subtable的mask与运算，肯定只需要与运算mask里

// 不为0的字段，其他的mask不关心，然后这个操作就是为了得到key对应字段是key->mf的第几位，

// 比如mask的bits[0]=11111, key的bits[0] = 10100, mask里的第3个1在key里面是第1个

// 这一位与的结果就是tbl_blocks[2]&pkt_blocks[0], 也就是怎么找到key里的下标0

// 就看key当前位之前有几个1就行了。这里这样做的1010111，

// 蓝色1之前有count_1bits(1010111 & 0001111) = 3

//  对上边的mask举个例子 count = 4;

//  mask->mf->bits[0] = 111010 (2进制)

//  mf_masks = [1, 111, 1111, 11111] (2进制);

//  pkt_mf_bits = 010100

//  blocks_scratch = [0,0,0,0,pkt_blocks[1]&tbl_blocks[4],0]

    uint32_t i;

    for (i = 0; i < count; i++) {

        // 拿i=2举例

        uint64_t mf_mask = mf_masks[i];             // mf_mask = 001111

        uint64_t idx_bits = mf_mask & pkt_mf_bits;  // idx_bits = 000100

        const uint32_t pkt_idx = count_1bits(idx_bits); // pkt_idx = 1

        uint64_t pkt_has_mf_bit = (mf_mask + 1) & pkt_mf_bits;  // pkt_has_mf_bit = 010000

        // 是否求掩码：mask当前位对应的key的字段，如果key在当前位是0，下边算掩码就会变成0

        uint64_t no_bit = ((!pkt_has_mf_bit) > 0) - 1; // 2^64 - 1

        // mask里第i个字段与运算key对应的字段

        blocks_scratch[i] = pkt_blocks[pkt_idx] & tbl_blocks[i] & no_bit; //

    }

}

key对应的cmap：cmap_find_batch()

unsigned long

cmap_find_batch(const struct cmap *cmap, unsigned long map,

                uint32_t hashes[], const struct cmap_node *nodes[])

{

    const struct cmap_impl *impl = cmap_get_impl(cmap);

    unsigned long result = map;

    int i;

    // 每一位就是一个包，一字节8个包

    uint32_t h1s[sizeof map * CHAR_BIT];

    const struct cmap_bucket *b1s[sizeof map * CHAR_BIT];

    const struct cmap_bucket *b2s[sizeof map * CHAR_BIT];

    uint32_t c1s[sizeof map * CHAR_BIT];

    // 每个impl里桶的数量为impl->mask+1

    // 为什么mask是桶的个数减1:因为下标从0开始，找下表的时候直接(hash & impl->mask)就行了

    // 至于为什么开两个？因为buckets存放的方法也是一个值对应两个hash

    // 第一次hash1 = rehash(impl->basis, hash), 找buckets[hash1 & impl->mask], 遍历里面CMAP_K个元素

    // 第二次hash2 = other_hash(hash1), 找buckets[hash2 & impl->mask], 遍历里面CMAP_K个元素

    /* Compute hashes and prefetch 1st buckets. */

    ULLONG_FOR_EACH_1(i, map) {

        h1s[i] = rehash(impl, hashes[i]);

        b1s[i] = &impl->buckets[h1s[i] & impl->mask];

        OVS_PREFETCH(b1s[i]);

    }

    /* Lookups, Round 1. Only look up at the first bucket. */

    ULLONG_FOR_EACH_1(i, map) {

        uint32_t c1;

        const struct cmap_bucket *b1 = b1s[i];

        const struct cmap_node *node;

        do {

            c1 = read_even_counter(b1);

            // 找一下这个cmap_bucket里面有没有相同hash的

            node = cmap_find_in_bucket(b1, hashes[i]);

        } while (OVS_UNLIKELY(counter_changed(b1, c1)));

        if (!node) {

            /* Not found (yet); Prefetch the 2nd bucket. */

            b2s[i] = &impl->buckets[other_hash(h1s[i]) & impl->mask];

            OVS_PREFETCH(b2s[i]);

            c1s[i] = c1; /* We may need to check this after Round 2. */

            continue;

        }

        /* Found. */

        ULLONG_SET0(map, i); /* Ignore this on round 2. */

        OVS_PREFETCH(node);

        nodes[i] = node;

    }

    /* Round 2. Look into the 2nd bucket, if needed. */

    ULLONG_FOR_EACH_1(i, map) {

        uint32_t c2;

        const struct cmap_bucket *b2 = b2s[i];

        const struct cmap_node *node;

        do {

            c2 = read_even_counter(b2);

            node = cmap_find_in_bucket(b2, hashes[i]);

        } while (OVS_UNLIKELY(counter_changed(b2, c2)));

        if (!node) {

            // 可能被修改了，

            if (OVS_UNLIKELY(counter_changed(b1s[i], c1s[i]))) {

                node = cmap_find__(b1s[i], b2s[i], hashes[i]);

                if (node) {

                    goto found;

                }

            }

            /* Not found. */

            ULLONG_SET0(result, i); /* Fix the result. */

            continue;

        }

found:

        OVS_PREFETCH(node);

        nodes[i] = node;

    }

    return result;

}

2.3 fast_path_processing()

static inline void

fast_path_processing(struct dp_netdev_pmd_thread *pmd,

                     struct dp_packet_batch *packets_,

                     struct netdev_flow_key **keys,

                     struct dp_packet_flow_map *flow_map,

                     uint8_t *index_map,

                     odp_port_t in_port)

{

    const size_t cnt = dp_packet_batch_size(packets_);

#if !defined(__CHECKER__) && !defined(_WIN32)

    const size_t PKT_ARRAY_SIZE = cnt;

#else

    /* Sparse or MSVC doesn't like variable length array. */

    enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };

#endif

    struct dp_packet *packet;

    struct dpcls *cls;

    struct dpcls_rule *rules[PKT_ARRAY_SIZE];

    struct dp_netdev *dp = pmd->dp;

    int upcall_ok_cnt = 0, upcall_fail_cnt = 0;

    int lookup_cnt = 0, add_lookup_cnt;

    bool any_miss;

    for (size_t i = 0; i < cnt; i++) {

        /* Key length is needed in all the cases, hash computed on demand. */

        keys[i]->len = netdev_flow_key_size(miniflow_n_values(&keys[i]->mf));

    }

    /* Get the classifier for the in_port */

    // 找到端口对应的dpcls结构，每个port有自己的dpcls，因为每个port收到的报文会更相似

    cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);

    if (OVS_LIKELY(cls)) {

        // 调用dpcls_lookup进行匹配

        any_miss = !dpcls_lookup(cls, (const struct netdev_flow_key **)keys,

                                rules, cnt, &lookup_cnt);

    } else {

        any_miss = true;

        memset(rules, 0, sizeof(rules));

    }

    // 如果有miss的，则需要进行openflow流表查询

    if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {

        uint64_t actions_stub[512 / 8], slow_stub[512 / 8];

        struct ofpbuf actions, put_actions;

        ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);

        ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub);

        DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {

            struct dp_netdev_flow *netdev_flow;

            if (OVS_LIKELY(rules[i])) {

                continue;

            }

            // 此时可能已经更新了，在进入upcall之前如果再查一次，如果能够查到，会比upcall消耗的少得多

            netdev_flow = dp_netdev_pmd_lookup_flow(pmd, keys[i],

                                                    &add_lookup_cnt);

            if (netdev_flow) {

                lookup_cnt += add_lookup_cnt;

                rules[i] = &netdev_flow->cr;

                continue;

            }

            // 第一级和第二级流表查找失败后，就要查找第三级流表了，即openflow流表，这也称为upcall调用。

            // 在普通ovs下是通过netlink实现的，在ovs+dpdk下，直接在pmd线程中调用upcall_cb即可。

            // 开始查找openflow流表。如果查找openflow流表成功并需要下发到dpcls时，需要判断是否超出最大流表限制

            int error = handle_packet_upcall(pmd, packet, keys[i],

                                             &actions, &put_actions);

            if (OVS_UNLIKELY(error)) {

                upcall_fail_cnt++;

            } else {

                upcall_ok_cnt++;

            }

        }

        ofpbuf_uninit(&actions);

        ofpbuf_uninit(&put_actions);

        fat_rwlock_unlock(&dp->upcall_rwlock);

    } else if (OVS_UNLIKELY(any_miss)) {

        DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {

            if (OVS_UNLIKELY(!rules[i])) {

                dp_packet_delete(packet);

                COVERAGE_INC(datapath_drop_lock_error);

                upcall_fail_cnt++;

            }

        }

    }

    DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {

        struct dp_netdev_flow *flow;

        /* Get the original order of this packet in received batch. */

        int recv_idx = index_map[i];

        uint16_t tcp_flags;

        if (OVS_UNLIKELY(!rules[i])) {

            continue;

        }

        flow = dp_netdev_flow_cast(rules[i]);

        bool hook_cached = false;

        if (pmd->cached_hook && \

                pmd->cached_hook_pmd && \

                pmd->cached_hook->hook_flow_miss) {

            hook_cached = pmd->cached_hook->hook_flow_miss(pmd->cached_hook_pmd, packet, flow);

        }

        if (!hook_cached) {

            bool smc_enable_db;

            atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db);

            // 查找到了packet，如果开启了smc，更新smc

            if (smc_enable_db) {

                uint32_t hash =  dp_netdev_flow_hash(&flow->ufid);

                smc_insert(pmd, keys[i], hash);

            }

            // 查到了packet，看是否写会更新上一层cache（EMC）

            if (emc_probabilistic_insert(pmd, keys[i], flow)) {

                if (flow->status == OFFLOAD_NONE) {

                    queue_netdev_flow_put(pmd->dp->dp_flow_offload, \

                            pmd->dp->class, \

                            flow, NULL, DP_NETDEV_FLOW_OFFLOAD_OP_ADD);

                }

            }

        }

        /* Add these packets into the flow map in the same order

         * as received.

         */

        tcp_flags = miniflow_get_tcp_flags(&keys[i]->mf);

        packet_enqueue_to_flow_map(packet, flow, tcp_flags,

                                   flow_map, recv_idx);

    }

    // 更新各个信息

    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,

                            cnt - upcall_ok_cnt - upcall_fail_cnt);

    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP,

                            lookup_cnt);

    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MISS,

                            upcall_ok_cnt);

    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_LOST,

                            upcall_fail_cnt);

}

2.4 smc更新smc_insert()

static inline void

smc_insert(struct dp_netdev_pmd_thread *pmd,

           const struct netdev_flow_key *key,

           uint32_t hash)

{

    struct smc_cache *smc_cache = &(pmd->flow_cache).smc_cache;

    struct smc_bucket *bucket = &smc_cache->buckets[key->hash & SMC_MASK];

    uint16_t index;

    uint32_t cmap_index;

    int i;

    //布谷鸟算法

    cmap_index = cmap_find_index(&pmd->flow_table, hash);

    index = (cmap_index >= UINT16_MAX) ? UINT16_MAX : (uint16_t)cmap_index;

    /* If the index is larger than SMC can handle (uint16_t), we don't

     * insert */

    if (index == UINT16_MAX) {

        //表明找到了

        return;

    }

    /* If an entry with same signature already exists, update the index */

    uint16_t sig = key->hash >> 16;

    for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {

        if (bucket->sig[i] == sig) {

            bucket->flow_idx[i] = index;

            return;

        }

    }

    /* If there is an empty entry, occupy it. */

    for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {

        if (bucket->flow_idx[i] == UINT16_MAX) {

            bucket->sig[i] = sig;

            bucket->flow_idx[i] = index;

            return;

        }

    }

    /* Otherwise, pick a random entry. */

    i = random_uint32() % SMC_ENTRY_PER_BUCKET;

    bucket->sig[i] = sig;

    bucket->flow_idx[i] = index;

}

3. upcall到openflow查找，然后更新dpcls

这里就不讲具体代码了，讲一下大概：到openflow查找后会更新dpcls，执行dp_netdev_flow_add() --> dpcls_insert() --> dpcls_find_subtable() --> cmap_insert()

dpcls_find_subtable()：

找一下是否存在相同mask的subtable，存在返回这个subtable，不存在就创建一个subtable，创建的时候会调用dpcls_create_subtable，里面有个dpcls_subtable_get_best_impl会根据mask的miniflow的bits[0]和bits[1]选择的查找算法。

cmap_insert里hash算法用的就是布谷鸟hash，hash两次，插入的核心代码：

static bool

cmap_try_insert(struct cmap_impl *impl, struct cmap_node *node, uint32_t hash)

{

    uint32_t h1 = rehash(impl, hash);

    uint32_t h2 = other_hash(h1);

    // hash两次找到两个桶

    struct cmap_bucket *b1 = &impl->buckets[h1 & impl->mask];

    struct cmap_bucket *b2 = &impl->buckets[h2 & impl->mask];

    // 插入规则：

    // 1.是否有相同hash的node，就插到对应链上

    // 2.没有相同hash，就看有没有空的node

    // 3.都不行就通过bfs，看能否让b1,b2空出来一个，把这个放进去

    // 都不行就插入失败

    return (OVS_UNLIKELY(cmap_insert_dup(node, hash, b1) ||

                         cmap_insert_dup(node, hash, b2)) ||

            OVS_LIKELY(cmap_insert_bucket(node, hash, b1) ||

                       cmap_insert_bucket(node, hash, b2)) ||

            cmap_insert_bfs(impl, node, hash, b1, b2));

}

参考博客：

OVS-DPDK Datapath Classifier ：这个是理论上的流程，看完就知道这个算法流程了

ovs分类器 flow和miniflow ：很重要的结构体miniflow

OVS-DPDK DataPath Classifier反向设计：这个有很多详细的解释，但不怎么流畅

OVS-DPDK 流表查询详解的更多相关文章

（转）Mysql 多表查询详解
MySQL 多表查询详解一.前言二.示例三.注意事项一.前言上篇讲到mysql中关键字执行的顺序,只涉及了一张表:实际应用大部分情况下,查询语句都会涉及到多张表格 : 1.1 多表连接有 ...
Mysql 多表查询详解
Reference: https://blog.csdn.net/jintao_ma/article/details/51260458 一.前言二.示例三.注意事项一.前言上篇讲到Mysql ...
MySQL简单查询详解-单表查询
MySQL简单查询详解-单表查询作者:尹正杰版权声明:原创作品,谢绝转载!否则将追究法律责任. 一.查询的执行路径一条SQL查询语句的执行过程大致如下图所示: 1>.客户端和服务端通过my ...
ovs源码阅读--流表查询原理
背景在ovs交换机中,报文的处理流程可以划分为一下三个步骤:协议解析,表项查找和动作执行,其中最耗时的步骤在于表项查找,往往一个流表中有数目巨大的表项,如何根据数据报文的信息快速的查找到对应的流表项 ...
openVswitch（OVS）源代码分析之工作流程（flow流表查询）
原文链接: openVswitch(OVS)源代码分析之工作流程(flow流表查询)
Neutron 理解 (4): Neutron OVS OpenFlow 流表和 L2 Population [Netruon OVS OpenFlow tables + L2 Population]
学习 Neutron 系列文章: (1)Neutron 所实现的虚拟化网络 (2)Neutron OpenvSwitch + VLAN 虚拟网络 (3)Neutron OpenvSwitch + GR ...
ElasticSearch第四步-查询详解
ElasticSearch系列学习 ElasticSearch第一步-环境配置 ElasticSearch第二步-CRUD之Sense ElasticSearch第三步-中文分词 ElasticSea ...
oracle表分区详解
原文来自:http://www.cnblogs.com/leiOOlei/archive/2012/06/08/2541306.html oracle表分区详解从以下几个方面来整理关于分区表的概念及 ...
ThinkPHP视图查询详解
ThinkPHP视图查询详解参考http://www.jb51.net/article/51674.htm 这篇文章主要介绍了ThinkPHP视图查询,需要的朋友可以参考下 ThinkP ...
SQL Server表分区详解
原文:SQL Server表分区详解什么是表分区一般情况下,我们建立数据库表时,表数据都存放在一个文件里. 但是如果是分区表的话,表数据就会按照你指定的规则分放到不同的文件里,把一个大的数据文件拆 ...

随机推荐

CSS3-transform位移实现双开门效果
transform可以用于实现位移,旋转,缩放等效果. 位移:transform: translate(水平距离,垂直距离); 这里先借助其位移属性实现双开门的效果,鼠标hover的时候最上面的图片向 ...
Redis设计实现-学习笔记
最近在准备面试,问到redis相关知识,只能说个皮毛,说的既不深入也不全面,所以抓紧突击一下,先学<redis设计与实现>. 选择看书的原因是: 书中全面深入,且能出书一定十分用心: 搜博 ...
[2001年NOIP普及组] 求先序排列
给出一棵二叉树的中序与后序排列.求出它的先序排列.(约定树结点用不同的大写字母表示,长度<=8). 输入第一行输入一个字符串表示二叉树的中序排列,第二行输入一个字符串表示二叉树的后序排列. 输 ...
微信小程序：微信web开发阶段性学习总结
小程序运行机制前台/后台状态小程序启动后,界面被展示给用户,此时小程序处于前台状态. 当用户点击右上角胶囊按钮关闭小程序,或者按了设备 Home 键离开微信时,小程序并没有完全终止运行,而是进入了 ...
浏览器F12，Network中各按钮的作用
Network下 preserve log:勾选,页面发生跳转,接口不丢失:(比如登录成功跳转到首页,登录的接口就没了,勾选Perserve log,会记录跳转前的接口): Disable cache ...
RDD编程
一.词频统计 1.读文本文件生成RDD lines 2.将一行一行的文本分割成单词 words flatmap() 3.全部转换为小写 lower() 4.去掉长度小于3的单词 filter() 5. ...
JavaScript 之数组在内存中的存储方式（连续或不连续）
最近在纠结一个问题,就是数组这个引用类型在JavaScript 中是不是和其他语言一样开辟了一个连续的内存来存储,但是在JS 中每个元素又可以是不同的类型,这就导致了没办法用一个相同大小的存储,所以数 ...
vscode php 方法无法跳转问题
安装扩展 PHP IntelliSense #路径感知 PHP Intelephense #自动补全 PHP DocBlocker #在类或属性.方法上面敲/**回车自动生成参数注释等 PHP Na ...
Debug --> python中的True False 0 1
今天看了下python中的一些基础知识,以offer64为例叭! 求 1+2+...+n ,要求不能使用乘除法.for.while.if.else.switch.case等关键字及条件判断语句(A?B ...
eset node32卸载记录
安装的是这个东西,卸载麻烦 1.一般的卸载软件比如wise program uninstall无论是普通卸载还是强制卸载都是实现不了的,火绒自带的文件粉碎是可以使用的,有两个目录要进行粉碎C:\Pro ...

OVS-DPDK 流表查询详解