OVS-DPDK 流表查询详解
一图胜千言:

flow和miniflow
在介绍之前先说一些概念:里面有两个结构很重要,一个是flow一个是miniflow这里介绍一下他们的数据结构和构造函数。
flow:
- metadata: 入端口号,寄存器等信息
- l2: 源目的mac,vlan和mpls等信息
- l3: ipv4/ipv6源目的ip,ttl等信息
- l4: 源目的端口号,icmp code和type等信息。
miniflow
struct miniflow {
struct flowmap map;
};
struct flowmap {
map_t bits[FLOWMAP_UNITS];
};
- struct flowmap map;是bit数组,使用其中的bit表示flow中哪个8字节存在有效数据,flow中占多少个8字节,那么就需要map中多个个bit,并且按照64bit向上取整。
- 第二部分是有效数据,有效数据动态分配,根据struct flowmap map;中1bit数个数进行分配,大小为bit数*8字节,该部分直接跟在map后面。该部分存储在netdev_flow_key结构中的buf数组。
//flow是8字节对齐的,除8得到flow中包含8字节的个数
#define FLOW_U64S (sizeof(struct flow) / sizeof(uint64_t)) //map大小为8字节,MAP_T_BITS 为64位
typedef unsigned long long map_t;
#define MAP_T_BITS (sizeof(map_t) * CHAR_BIT) //每位表示一个u64,FLOWMAP_UNITS 表示最少需要几个64位
#define FLOWMAP_UNITS DIV_ROUND_UP(FLOW_U64S, MAP_T_BITS) struct flowmap {
map_t bits[FLOWMAP_UNITS];
}; struct miniflow {
struct flowmap map;
/* Followed by:
* uint64_t values[n];
* where 'n' is miniflow_n_values(miniflow). */
}; struct netdev_flow_key {
uint32_t hash;
uint32_t len;
struct miniflow mf; // bits
uint64_t buf[FLOW_MAX_PACKET_U64S]; // 就是上边所说的value
}; // 有些字段是互斥的
#define FLOW_MAX_PACKET_U64S (FLOW_U64S \
/* Unused in datapath */ - FLOW_U64_SIZE(regs) \
- FLOW_U64_SIZE(metadata) \
/* L2.5/3 */ - FLOW_U64_SIZE(nw_src) /* incl. nw_dst */ \
- FLOW_U64_SIZE(mpls_lse) \
/* L4 */ - FLOW_U64_SIZE(tp_src) \
)
- 使用miniflow可以节省内存
- 如果只想遍历flow中的非0字段时,使用miniflow找到对应的非0字段,可以节省时间
flow->miniflow函数:miniflow_extract()
void
miniflow_extract(struct dp_packet *packet, struct miniflow *dst)
{
...
// 初始化赋值有两个关键,一个是这个values: return (uint64_t *)(mf + 1);
// 就是上边说的
uint64_t *values = miniflow_values(dst);
struct mf_ctx mf = { FLOWMAP_EMPTY_INITIALIZER, values,
values + FLOW_U64S };
...
if (md->skb_priority || md->pkt_mark) {
miniflow_push_uint32(mf, skb_priority, md->skb_priority);
miniflow_push_uint32(mf, pkt_mark, md->pkt_mark);
}
miniflow_push_be16(mf, dl_type, dl_type);
miniflow_pad_to_64(mf, dl_type);
... // 去取网络层信息,从这里可以看出,ovs暂时只支持IP,IPV6,ARP,RARP报文
if (OVS_LIKELY(dl_type == htons(ETH_TYPE_IP))){...}
else if
... // 提取传输层,从这里可以看出,ovs暂时支持传输层协议有TCP,UDP,SCTP,ICMP,ICMPV6
if (OVS_LIKELY(nw_proto == IPPROTO_TCP)){...}
else if
...
#define miniflow_push_uint32(MF, FIELD, VALUE) \
miniflow_push_uint32_(MF, offsetof(struct flow, FIELD), VALUE) #define miniflow_push_uint32_(MF, OFS, VALUE) \
{ \
MINIFLOW_ASSERT(MF.data < MF.end); \
\
//成员变量位于起始位置,需要调用miniflow_set_map设置对应的bit为1
if ((OFS) % 8 == 0) { \
miniflow_set_map(MF, OFS / 8); \
*(uint32_t *)MF.data = VALUE; \
} else if ((OFS) % 8 == 4) { \
//成员变量不在起始位置,要判断此变量所在的bit为1
miniflow_assert_in_map(MF, OFS / 8); \
*((uint32_t *)MF.data + 1) = VALUE; \
MF.data++; \
} \
}
miniflow->flow函数:miniflow_expand()
/* Initializes 'dst' as a copy of 'src'. */
void
miniflow_expand(const struct miniflow *src, struct flow *dst)
{
memset(dst, 0, sizeof *dst);
flow_union_with_miniflow(dst, src);
} /* Perform a bitwise OR of miniflow 'src' flow data with the equivalent
* fields in 'dst', storing the result in 'dst'. */
static inline void
flow_union_with_miniflow(struct flow *dst, const struct miniflow *src)
{
flow_union_with_miniflow_subset(dst, src, src->map);
} static inline void
flow_union_with_miniflow_subset(struct flow *dst, const struct miniflow *src,
struct flowmap subset)
{
uint64_t *dst_u64 = (uint64_t *) dst;
const uint64_t *p = miniflow_get_values(src);
map_t map;
//遍历所有的map
FLOWMAP_FOR_EACH_MAP (map, subset) {
size_t idx;
//遍历map中所有的非0bit
MAP_FOR_EACH_INDEX(idx, map) {
dst_u64[idx] |= *p++;
}
dst_u64 += MAP_T_BITS;
}
}
流表查询过程
概要
入口在dp_netdev_input__()
static void
dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
struct dp_packet_batch *packets,
bool md_is_valid, odp_port_t port_no)
{
#if !defined(__CHECKER__) && !defined(_WIN32)
const size_t PKT_ARRAY_SIZE = dp_packet_batch_size(packets);
#else
/* Sparse or MSVC doesn't like variable length array. */
enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
#endif
OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
struct netdev_flow_key keys[PKT_ARRAY_SIZE];
struct netdev_flow_key *missed_keys[PKT_ARRAY_SIZE];
struct packet_batch_per_flow batches[PKT_ARRAY_SIZE];
size_t n_batches;
struct dp_packet_flow_map flow_map[PKT_ARRAY_SIZE];
uint8_t index_map[PKT_ARRAY_SIZE];
size_t n_flows, i; odp_port_t in_port; n_batches = 0;
// 1. dfc_processing之后会把miss的放到packets里
// 找到的可能已经batched了,或者放到flow_map里了
// flow_map里是未bathed的,可能直接是*flow或者是NULL,是NULL再去下一层cache查
dfc_processing(pmd, packets, keys, missed_keys, batches, &n_batches,
flow_map, &n_flows, index_map, md_is_valid, port_no); // 2. 如果有miss的,再去找fast-path,也就是查dpcls
if (!dp_packet_batch_is_empty(packets)) {
in_port = packets->packets[0]->md.in_port.odp_port;
fast_path_processing(pmd, packets, missed_keys,
flow_map, index_map, in_port);
} /* Batch rest of packets which are in flow map. */
for (i = 0; i < n_flows; i++) {
struct dp_packet_flow_map *map = &flow_map[i]; if (OVS_UNLIKELY(!map->flow)) {
continue;
}
dp_netdev_queue_batches(map->packet, map->flow, map->tcp_flags,
batches, &n_batches);
} for (i = 0; i < n_batches; i++) {
batches[i].flow->batch = NULL;
} // 执行每个packet的action
for (i = 0; i < n_batches; i++) {
packet_batch_per_flow_execute(&batches[i], pmd);
}
}
1. DFC查询:dfc_processing()
static inline size_t
dfc_processing(struct dp_netdev_pmd_thread *pmd,
struct dp_packet_batch *packets_,
struct netdev_flow_key *keys,
struct netdev_flow_key **missed_keys,
struct packet_batch_per_flow batches[], size_t *n_batches,
struct dp_packet_flow_map *flow_map,
size_t *n_flows, uint8_t *index_map,
bool md_is_valid, odp_port_t port_no)
{
struct netdev_flow_key *key = &keys[0];
size_t n_missed = 0, n_emc_hit = 0;
struct dfc_cache *cache = &pmd->flow_cache;
struct dp_packet *packet;
size_t cnt = dp_packet_batch_size(packets_);
// emc的插入概率,如果为0,表示不开启emc
uint32_t cur_min = pmd->ctx.emc_insert_min;
int i;
uint16_t tcp_flags;
bool smc_enable_db;
// 记录未batched的个数
size_t map_cnt = 0;
// 这个变量用于保序
bool batch_enable = true;
// 获取smc是否开启参数
atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db);
pmd_perf_update_counter(&pmd->perf_stats,
md_is_valid ? PMD_STAT_RECIRC : PMD_STAT_RECV,
cnt); do_dfc_hook(pmd, packets_, batches, n_batches);
cnt = dp_packet_batch_size(packets_); // 逐个对dp_packet_batch中的每一个packet进行处理
DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
struct dp_netdev_flow *flow;
// 若packet包长小于以太头的长度直接丢包
if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
dp_packet_delete(packet);
COVERAGE_INC(datapath_drop_rx_invalid_packet);
continue;
}
// 对数据手工预取可减少读取延迟,从而提高性能
if (i != cnt - 1) {
struct dp_packet **packets = packets_->packets;
/* Prefetch next packet data and metadata. */
OVS_PREFETCH(dp_packet_data(packets[i+1]));
pkt_metadata_prefetch_init(&packets[i+1]->md);
} // 初始化metadata首先将pkt_metadata中flow_in_port前的字节全部设为0
// 将in_port.odp_port设为port_no, tunnel.ipv6_dst设为in6addr_any
if (!md_is_valid) {
pkt_metadata_init(&packet->md, port_no);
}
// 报文转化为miniflow, 上文有讲
miniflow_extract(packet, &key->mf);
key->len = 0; /* Not computed yet. */
// 计算当前报文miniflow的hash值
key->hash =
(md_is_valid == false)
? dpif_netdev_packet_get_rss_hash_orig_pkt(packet, &key->mf)
: dpif_netdev_packet_get_rss_hash(packet, &key->mf); // 根据key->hash,emc_entry alive,miniflow 3个条件得到dp_netdev_flow
// cur_min = 0,表示不可能插入,后面有讲什么时候才会插入EMC
flow = (cur_min != 0) ? emc_lookup(&cache->emc_cache, key) : NULL; if (OVS_LIKELY(flow)) {
tcp_flags = miniflow_get_tcp_flags(&key->mf);
n_emc_hit++; // 命中次数+1
// 为了保证报文的顺序,所有的packet对应的flow都用flow_map存储
// flow_map里面就是packet数量对应的(packet,flow,tcp_flag)
// 最后会把这些在dp_netdev_input__里重新把顺序合并一下
if (OVS_LIKELY(batch_enable)) {
// 把查到的flow加到batches里第n_batches个batch里
dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
n_batches);
} else { packet_enqueue_to_flow_map(packet, flow, tcp_flags,
flow_map, map_cnt++);
}
} else {
// 这些数据结构用于smc查询时的记录
// 没查到把packet放到packets_里,从下标0再开始放
// 最后packets_都是未查到的
dp_packet_batch_refill(packets_, packet, i);
index_map[n_missed] = map_cnt;
flow_map[map_cnt++].flow = NULL;
missed_keys[n_missed] = key;
key = &keys[++n_missed];
batch_enable = false; // 之后的都是未batched的
}
}
*n_flows = map_cnt; pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, n_emc_hit);
// 如果没有开启smc,直接返回了
if (!smc_enable_db) {
return dp_packet_batch_size(packets_);
} smc_lookup_batch(pmd, keys, missed_keys, packets_,
n_missed, flow_map, index_map); return dp_packet_batch_size(packets_);
}
1.1 emc查询:emc_lookup()
static inline struct dp_netdev_flow *
emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
{
struct emc_entry *current_entry;
// 这里说一下,一个hash分配两个桶,长度为13位,cache桶的大小为1<<13
// struct emc_cache {
// struct emc_entry entries[EM_FLOW_HASH_ENTRIES];
// int sweep_idx; /* For emc_cache_slow_sweep(). */
// };
EMC_FOR_EACH_POS_WITH_HASH (cache, current_entry, key->hash) {
if (current_entry->key.hash == key->hash
&& emc_entry_alive(current_entry)
&& emc_flow_key_equal_mf(¤t_entry->key, &key->mf)) {
/* We found the entry with the 'key->mf' miniflow */
return current_entry->flow;
}
}
return NULL;
} #define EM_FLOW_HASH_SHIFT 13
#define EM_FLOW_HASH_ENTRIES (1u << EM_FLOW_HASH_SHIFT)
#define EM_FLOW_HASH_MASK (EM_FLOW_HASH_ENTRIES - 1)
#define EM_FLOW_HASH_SEGS 2
#define EMC_FOR_EACH_POS_WITH_HASH(EMC, CURRENT_ENTRY, HASH) \
for (uint32_t i__ = 0, srch_hash__ = (HASH); \
(CURRENT_ENTRY) = &(EMC)->entries[srch_hash__ & EM_FLOW_HASH_MASK], \
i__ < EM_FLOW_HASH_SEGS; \
i__++, srch_hash__ >>= EM_FLOW_HASH_SHIFT) // 比较miniflow是否相同
static inline bool
emc_flow_key_equal_mf(const struct netdev_flow_key *key,
const struct miniflow *mf)
{
return !memcmp(&key->mf, mf, key->len);
}
1.2 smc查询:smc_lookup_batch()
static inline void
smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
struct netdev_flow_key *keys,
struct netdev_flow_key **missed_keys,
struct dp_packet_batch *packets_,
const int cnt,
struct dp_packet_flow_map *flow_map,
uint8_t *index_map)
{
int i;
struct dp_packet *packet;
size_t n_smc_hit = 0, n_missed = 0;
struct dfc_cache *cache = &pmd->flow_cache;
struct smc_cache *smc_cache = &cache->smc_cache;
const struct cmap_node *flow_node;
int recv_idx;
uint16_t tcp_flags; /* Prefetch buckets for all packets */
for (i = 0; i < cnt; i++) {
OVS_PREFETCH(&smc_cache->buckets[keys[i].hash & SMC_MASK]);
} DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
struct dp_netdev_flow *flow = NULL;
// 找到hash相同的flow链表的头节点
flow_node = smc_entry_get(pmd, keys[i].hash);
bool hit = false;
/* Get the original order of this packet in received batch. */
recv_idx = index_map[i]; if (OVS_LIKELY(flow_node != NULL)) {
// 遍历一下看看哪一个是相同的,这个通过offsetof找到存放该cmap结构体的首地址
// dp_netdev_flow里面的首地址就是,
CMAP_NODE_FOR_EACH (flow, node, flow_node) {
/* Since we dont have per-port megaflow to check the port
* number, we need to verify that the input ports match. */
if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, &keys[i]) &&
flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
tcp_flags = miniflow_get_tcp_flags(&keys[i].mf);
keys[i].len =
netdev_flow_key_size(miniflow_n_values(&keys[i].mf));
if (emc_probabilistic_insert(pmd, &keys[i], flow)) {
if (flow->status == OFFLOAD_NONE) {
queue_netdev_flow_put(pmd->dp->dp_flow_offload, \
pmd->dp->class, \
flow, NULL, DP_NETDEV_FLOW_OFFLOAD_OP_ADD);
}
}
packet_enqueue_to_flow_map(packet, flow, tcp_flags,
flow_map, recv_idx);
n_smc_hit++;
hit = true;
break;
}
}
if (hit) {
continue;
}
}
// SMC也miss了,和之前一样,把miss的放packets_里,从0开始放
dp_packet_batch_refill(packets_, packet, i);
index_map[n_missed] = recv_idx;
missed_keys[n_missed++] = &keys[i];
} pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, n_smc_hit);
}
static inline const struct cmap_node *
smc_entry_get(struct dp_netdev_pmd_thread *pmd, const uint32_t hash)
{
struct smc_cache *cache = &(pmd->flow_cache).smc_cache;
// smc_cache桶的大小是(1<<18),SMC_MASK=(1<<18)- 1
// 先通过后hash的后18位定位到桶
struct smc_bucket *bucket = &cache->buckets[hash & SMC_MASK];
// 一个桶有4个16位的sig,存key->hash前16位,正好是64位
// 遍历4个元素看那个匹配,获得匹配后的cmap的下标
uint16_t sig = hash >> 16;
uint16_t index = UINT16_MAX; for (int i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
if (bucket->sig[i] == sig) {
index = bucket->flow_idx[i];
break;
}
}
// 通过index找到在dpcls里的桶位置
if (index != UINT16_MAX) {
return cmap_find_by_index(&pmd->flow_table, index);
}
return NULL;
}
1.3 更新emc:emc_probabilistic_insert()
static inline bool
emc_probabilistic_insert(struct dp_netdev_pmd_thread *pmd,
const struct netdev_flow_key *key,
struct dp_netdev_flow *flow)
{
/* Insert an entry into the EMC based on probability value 'min'. By
* default the value is UINT32_MAX / 100 which yields an insertion
* probability of 1/100 ie. 1% */
uint32_t min = pmd->ctx.emc_insert_min;
if (min && random_uint32() <= min) {
emc_insert(&(pmd->flow_cache).emc_cache, key, flow);
return true;
}
return false;
}
static inline void
emc_insert(struct emc_cache *cache, const struct netdev_flow_key *key,
struct dp_netdev_flow *flow)
{
struct emc_entry *to_be_replaced = NULL;
struct emc_entry *current_entry; EMC_FOR_EACH_POS_WITH_HASH(cache, current_entry, key->hash) {
if (netdev_flow_key_equal(¤t_entry->key, key)) {
/* We found the entry with the 'mf' miniflow */
emc_change_entry(current_entry, flow, NULL);
return;
}
/* Replacement policy: put the flow in an empty (not alive) entry, or
* in the first entry where it can be */ if (!to_be_replaced
|| (emc_entry_alive(to_be_replaced)
&& !emc_entry_alive(current_entry))
|| current_entry->key.hash < to_be_replaced->key.hash) {
// 这个黄色判断就是我迷惑的地方
to_be_replaced = current_entry;
}
}
/* We didn't find the miniflow in the cache.
* The 'to_be_replaced' entry is where the new flow will be stored */
emc_change_entry(to_be_replaced, flow, key);
}
1.4 EMC的轮训更新
if (lc++ > 1024) {
lc = 0;
coverage_try_clear();
// 这里的optimize是排序一下TSS
dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt);
dp_netdev_pmd_hook_idle_run(pmd);
#ifdef ENABLE_EMC
if (!ovsrcu_try_quiesce()) {
emc_cache_slow_sweep(pmd->dp, &((pmd->flow_cache).emc_cache));
}
#else
ovsrcu_try_quiesce();
#endif
for (i = 0; i < poll_cnt; i++) {
uint64_t current_seq =
netdev_get_change_seq(poll_list[i].rxq->port->netdev);
if (poll_list[i].change_seq != current_seq) {
poll_list[i].change_seq = current_seq;
poll_list[i].rxq_enabled =
netdev_rxq_enabled(poll_list[i].rxq->rx);
}
}
}
1.5 承上启下:OVS的TSS算法
TSS算法原理
算法优缺点
OVS里做的排序优化
2. dpcls查询
2.1 dpcls相关数据结构
// 线程安全的
#define OVSRCU_TYPE(TYPE) struct { ATOMIC(TYPE) p; } struct cmap {
OVSRCU_TYPE(struct cmap_impl *) impl;
}; /* The implementation of a concurrent hash map. */
struct cmap_impl {
// 补齐64字节
PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline0,
unsigned int n; /* Number of in-use elements. */
unsigned int max_n; /* Max elements before enlarging. */
unsigned int min_n; /* Min elements before shrinking. */
uint32_t mask; /* Number of 'buckets', minus one. */
uint32_t basis; /* Basis for rehashing client's
hash values. */
);
PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline1,
struct cmap_bucket buckets[1];
);
}; struct cmap_bucket {
/* Padding to make cmap_bucket exactly one cache line long. */
PADDED_MEMBERS(CACHE_LINE_SIZE,
// 锁机制,读和写都会+1,读的时候等到变成偶数再去读,保证安全
atomic_uint32_t counter;
// 桶中的每个槽用(hashs[i], nodes[i])元组来表示
uint32_t hashes[CMAP_K];
struct cmap_node nodes[CMAP_K];
);
};
struct cmap_node {
OVSRCU_TYPE(struct cmap_node *) next; /* Next node with same hash. */
}; /* 二级匹配表.每个报文接收端口对应一个 */
struct dpcls {
struct cmap_node node; /* 链表节点 */
odp_port_t in_port; /* 报文接收端口 */
struct cmap subtables_map; // 管理下边subtables的索引,用于遍历
struct pvector subtables; // 上文TSS算法所说的元组表
}
struct pvector {
// 指向具体子表信息
OVSRCU_TYPE(struct pvector_impl *) impl;
// 平时,temp都是为NULL.只有当pvector扩充时,temp才用来临时缓存数据.
// 待排好序后,再拷贝到impl中,temp再置NULL
struct pvector_impl *temp;
}; // 相当于vector<pvector_entry>
struct pvector_impl {
size_t size; /* Number of entries in the vector */
size_t allocated; /* Number allocted entries */
/* 初始化的时候只有4个元素.后续可能会扩充 */
struct pvector_entry vector[];
}
struct pvector_entry {
// pvector_impl中的vector是按照priority从小到大排序的
// pmd_thread_main里会把priority赋值为hit_cnt,然后排序
int priority;
/* 实际指向了struct dpcls_subtable结构 */
void *ptr;
}
// 子表信息
struct dpcls_subtable {
/* The fields are only used by writers. */
struct cmap_node cmap_node OVS_GUARDED; /* Within dpcls 'subtables_map'. */ struct cmap rules; // 该表的bucket内容
uint32_t hit_cnt; // 命中该子表的次数 // 下边是mask的miniflow前两个的bits里1的个数
uint8_t mf_bits_set_unit0;
uint8_t mf_bits_set_unit1;
// 根据mf_bits_set_unit01选择最合适的查找算法
dpcls_subtable_lookup_func lookup_func; /* Caches the masks to match a packet to, reducing runtime calculations. */
uint64_t *mf_masks; // 由下边的mask->mf->bits[01]得来的,
struct netdev_flow_key mask; // 该表的掩码信息
}; 关于上边的mf_masks与mask,举个例子
mf_bits_set_unit0 = 4, mf_bits_set_unit1 = 0
netdev_flow_key.mf.bits[0] = 111010 (2进制)
mf_masks = [1, 111, 1111, 11111] (2进制)
2.2 dpcls查询入口:fast_path_processing->dpcls_lookup()
static bool
dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
struct dpcls_rule **rules, const size_t cnt,
int *num_lookups_p)
{
#define MAP_BITS (sizeof(uint32_t) * CHAR_BIT)
BUILD_ASSERT_DECL(MAP_BITS >= NETDEV_MAX_BURST); struct dpcls_subtable *subtable;
uint32_t keys_map = TYPE_MAXIMUM(uint32_t); /* Set all bits. */ if (cnt != MAP_BITS) {
/*keys_map中置1位数为包的总数,并且第i位对应第i个包*/
keys_map >>= MAP_BITS - cnt; /* Clear extra bits. */
}
memset(rules, 0, cnt * sizeof *rules); int lookups_match = 0, subtable_pos = 1;
uint32_t found_map; PVECTOR_FOR_EACH (subtable, &cls->subtables) {
// 查找函数,对应下边的lookup_generic()
found_map = subtable->lookup_func(subtable, keys_map, keys, rules); uint32_t pkts_matched = count_1bits(found_map);
// 搜索的子表个数,加上的是当前这几个key找了多少个表
lookups_match += pkts_matched * subtable_pos; keys_map &= ~found_map;
if (!keys_map) {
if (num_lookups_p) {
*num_lookups_p = lookups_match;
}
// 全找到了
return true;
}
subtable_pos++;
} if (num_lookups_p) {
*num_lookups_p = lookups_match;
}
// 没有全找到
return false;
}
lookup_generic()
入口在这里,往下走,传进去subtable有效字段有多大
static uint32_t
dpcls_subtable_lookup_generic(struct dpcls_subtable *subtable,
uint32_t keys_map,
const struct netdev_flow_key *keys[],
struct dpcls_rule **rules)
{
return lookup_generic_impl(subtable, keys_map, keys, rules,
subtable->mf_bits_set_unit0,
subtable->mf_bits_set_unit1);
} static inline uint32_t ALWAYS_INLINE
lookup_generic_impl(struct dpcls_subtable *subtable, // 当前的subtable
uint32_t keys_map, // miss_bit_map
const struct netdev_flow_key *keys[], // miss_key
struct dpcls_rule **rules, // save hit_rule
const uint32_t bit_count_u0,
const uint32_t bit_count_u1)
{
// 有几个包
const uint32_t n_pkts = count_1bits(keys_map);
ovs_assert(NETDEV_MAX_BURST >= n_pkts);
uint32_t hashes[NETDEV_MAX_BURST]; // 根据mask字段的大小开空间
const uint32_t bit_count_total = bit_count_u0 + bit_count_u1;
// 一个batch最大是NETDEV_MAX_BURST
const uint32_t block_count_required = bit_count_total * NETDEV_MAX_BURST;
uint64_t *mf_masks = subtable->mf_masks;
int i; // 申请存储一个batch报文信息的数组,存放
uint64_t *blocks_scratch = get_blocks_scratch(block_count_required); // 获得每个key与当前表的mask“与运算”的结果
ULLONG_FOR_EACH_1 (i, keys_map) {
netdev_flow_key_flatten(keys[i],
&subtable->mask, // 该表的掩码信息
mf_masks, // 由subtable->mask处理后的mask
&blocks_scratch[i * bit_count_total],
bit_count_u0,
bit_count_u1);
} // 算出来每一个key在该subtable里的hash值,该hash值由“mask字节数,key和mask与运算结果”得出
ULLONG_FOR_EACH_1 (i, keys_map) {
uint64_t *block_ptr = &blocks_scratch[i * bit_count_total];
uint32_t hash = hash_add_words64(0, block_ptr, bit_count_total);
hashes[i] = hash_finish(hash, bit_count_total * 8);
} uint32_t found_map;
const struct cmap_node *nodes[NETDEV_MAX_BURST]; // 找到每个key在该subtable里的cmap,并且返回每个key有没有被找到,第i位是1则找到
found_map = cmap_find_batch(&subtable->rules, keys_map, hashes, nodes); ULLONG_FOR_EACH_1 (i, found_map) {
struct dpcls_rule *rule;
// 可能不同的rule有相同的hash,看那个是匹配的
CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
const uint32_t cidx = i * bit_count_total;
/*rule->mask & keys[i]的值与rule->flow相比较*/
uint32_t match = netdev_rule_matches_key(rule, bit_count_total,
&blocks_scratch[cidx]);
if (OVS_LIKELY(match)) {
rules[i] = rule;
subtable->hit_cnt++;
goto next;
}
}
ULLONG_SET0(found_map, i); /* Did not match. */
next:
; /* Keep Sparse happy. */
}
return found_map;
}
掩码运算netdev_flow_key_flatten()
// 这个函数对应dpif-netdev.c里面的dpcls_flow_key_gen_masks()
static inline void
netdev_flow_key_flatten(const struct netdev_flow_key *key, // 要查找的miss_key
const struct netdev_flow_key *mask,
const uint64_t *mf_masks,
uint64_t *blocks_scratch,
const uint32_t u0_count,
const uint32_t u1_count)
{
/* Load mask from subtable, mask with packet mf, popcount to get idx. */
const uint64_t *pkt_blocks = miniflow_get_values(&key->mf);
const uint64_t *tbl_blocks = miniflow_get_values(&mask->mf); // 获取miss_key和mask的miniflow /* Packet miniflow bits to be masked by pre-calculated mf_masks. */
const uint64_t pkt_bits_u0 = key->mf.map.bits[0];
const uint32_t pkt_bits_u0_pop = count_1bits(pkt_bits_u0);
const uint64_t pkt_bits_u1 = key->mf.map.bits[1]; // 这个函数就是把miss_key与subtable的掩码进行&运算
// 会运算出该mask在意字段结果,放到blocks_scratch里
netdev_flow_key_flatten_unit(&pkt_blocks[0], // key-mf的数据段
&tbl_blocks[0], // mask->mf的数据段
&mf_masks[0], // mask->mf->bits得来mask
&blocks_scratch[0], // 存放的地址
pkt_bits_u0, // key->mf里的bits[0]
u0_count); // mask->mf->bits[0]里1的个数 netdev_flow_key_flatten_unit(&pkt_blocks[pkt_bits_u0_pop], // 上边bits[0]的已经算过了,从bits[1]开始算
&tbl_blocks[u0_count],
&mf_masks[u0_count],
&blocks_scratch[u0_count],
pkt_bits_u1,
u1_count);
} static inline void
netdev_flow_key_flatten_unit(const uint64_t *pkt_blocks, // key-mf的数据段
const uint64_t *tbl_blocks, // mask->mf里的数据段
const uint64_t *mf_masks, // mask->mf->bits得来mask
uint64_t *blocks_scratch, // 存放到这里
const uint64_t pkt_mf_bits, // key->mf里的bits[01]
const uint32_t count) // mask->mf->bits[0]里1的个数
{ // 说一下意思,这个我们流程就是用key和subtable的mask与运算,肯定只需要与运算mask里
// 不为0的字段,其他的mask不关心,然后这个操作就是为了得到key对应字段是key->mf的第几位,
// 比如mask的bits[0]=11111, key的bits[0] = 10100, mask里的第3个1在key里面是第1个
// 这一位与的结果就是tbl_blocks[2]&pkt_blocks[0], 也就是怎么找到key里的下标0
// 就看key当前位之前有几个1就行了。这里这样做的1010111,
// 蓝色1之前有count_1bits(1010111 & 0001111) = 3 // 对上边的mask举个例子 count = 4;
// mask->mf->bits[0] = 111010 (2进制)
// mf_masks = [1, 111, 1111, 11111] (2进制);
// pkt_mf_bits = 010100
// blocks_scratch = [0,0,0,0,pkt_blocks[1]&tbl_blocks[4],0]
uint32_t i;
for (i = 0; i < count; i++) {
// 拿i=2举例
uint64_t mf_mask = mf_masks[i]; // mf_mask = 001111
uint64_t idx_bits = mf_mask & pkt_mf_bits; // idx_bits = 000100
const uint32_t pkt_idx = count_1bits(idx_bits); // pkt_idx = 1 uint64_t pkt_has_mf_bit = (mf_mask + 1) & pkt_mf_bits; // pkt_has_mf_bit = 010000
// 是否求掩码:mask当前位对应的key的字段,如果key在当前位是0,下边算掩码就会变成0
uint64_t no_bit = ((!pkt_has_mf_bit) > 0) - 1; // 2^64 - 1 // mask里第i个字段与运算key对应的字段
blocks_scratch[i] = pkt_blocks[pkt_idx] & tbl_blocks[i] & no_bit; //
}
}
key对应的cmap:cmap_find_batch()
unsigned long
cmap_find_batch(const struct cmap *cmap, unsigned long map,
uint32_t hashes[], const struct cmap_node *nodes[])
{
const struct cmap_impl *impl = cmap_get_impl(cmap);
unsigned long result = map;
int i;
// 每一位就是一个包,一字节8个包
uint32_t h1s[sizeof map * CHAR_BIT];
const struct cmap_bucket *b1s[sizeof map * CHAR_BIT];
const struct cmap_bucket *b2s[sizeof map * CHAR_BIT];
uint32_t c1s[sizeof map * CHAR_BIT]; // 每个impl里桶的数量为impl->mask+1
// 为什么mask是桶的个数减1:因为下标从0开始,找下表的时候直接(hash & impl->mask)就行了 // 至于为什么开两个?因为buckets存放的方法也是一个值对应两个hash
// 第一次hash1 = rehash(impl->basis, hash), 找buckets[hash1 & impl->mask], 遍历里面CMAP_K个元素
// 第二次hash2 = other_hash(hash1), 找buckets[hash2 & impl->mask], 遍历里面CMAP_K个元素 /* Compute hashes and prefetch 1st buckets. */
ULLONG_FOR_EACH_1(i, map) {
h1s[i] = rehash(impl, hashes[i]);
b1s[i] = &impl->buckets[h1s[i] & impl->mask];
OVS_PREFETCH(b1s[i]);
}
/* Lookups, Round 1. Only look up at the first bucket. */
ULLONG_FOR_EACH_1(i, map) {
uint32_t c1;
const struct cmap_bucket *b1 = b1s[i];
const struct cmap_node *node; do {
c1 = read_even_counter(b1);
// 找一下这个cmap_bucket里面有没有相同hash的
node = cmap_find_in_bucket(b1, hashes[i]);
} while (OVS_UNLIKELY(counter_changed(b1, c1))); if (!node) {
/* Not found (yet); Prefetch the 2nd bucket. */
b2s[i] = &impl->buckets[other_hash(h1s[i]) & impl->mask];
OVS_PREFETCH(b2s[i]);
c1s[i] = c1; /* We may need to check this after Round 2. */
continue;
}
/* Found. */
ULLONG_SET0(map, i); /* Ignore this on round 2. */
OVS_PREFETCH(node);
nodes[i] = node;
}
/* Round 2. Look into the 2nd bucket, if needed. */
ULLONG_FOR_EACH_1(i, map) {
uint32_t c2;
const struct cmap_bucket *b2 = b2s[i];
const struct cmap_node *node; do {
c2 = read_even_counter(b2);
node = cmap_find_in_bucket(b2, hashes[i]);
} while (OVS_UNLIKELY(counter_changed(b2, c2))); if (!node) {
// 可能被修改了,
if (OVS_UNLIKELY(counter_changed(b1s[i], c1s[i]))) {
node = cmap_find__(b1s[i], b2s[i], hashes[i]);
if (node) {
goto found;
}
}
/* Not found. */
ULLONG_SET0(result, i); /* Fix the result. */
continue;
}
found:
OVS_PREFETCH(node);
nodes[i] = node;
}
return result;
}
2.3 fast_path_processing()
static inline void
fast_path_processing(struct dp_netdev_pmd_thread *pmd,
struct dp_packet_batch *packets_,
struct netdev_flow_key **keys,
struct dp_packet_flow_map *flow_map,
uint8_t *index_map,
odp_port_t in_port)
{
const size_t cnt = dp_packet_batch_size(packets_);
#if !defined(__CHECKER__) && !defined(_WIN32)
const size_t PKT_ARRAY_SIZE = cnt;
#else
/* Sparse or MSVC doesn't like variable length array. */
enum { PKT_ARRAY_SIZE = NETDEV_MAX_BURST };
#endif
struct dp_packet *packet;
struct dpcls *cls;
struct dpcls_rule *rules[PKT_ARRAY_SIZE];
struct dp_netdev *dp = pmd->dp;
int upcall_ok_cnt = 0, upcall_fail_cnt = 0;
int lookup_cnt = 0, add_lookup_cnt;
bool any_miss; for (size_t i = 0; i < cnt; i++) {
/* Key length is needed in all the cases, hash computed on demand. */
keys[i]->len = netdev_flow_key_size(miniflow_n_values(&keys[i]->mf));
}
/* Get the classifier for the in_port */
// 找到端口对应的dpcls结构,每个port有自己的dpcls,因为每个port收到的报文会更相似
cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
if (OVS_LIKELY(cls)) {
// 调用dpcls_lookup进行匹配
any_miss = !dpcls_lookup(cls, (const struct netdev_flow_key **)keys,
rules, cnt, &lookup_cnt);
} else {
any_miss = true;
memset(rules, 0, sizeof(rules));
}
// 如果有miss的,则需要进行openflow流表查询
if (OVS_UNLIKELY(any_miss) && !fat_rwlock_tryrdlock(&dp->upcall_rwlock)) {
uint64_t actions_stub[512 / 8], slow_stub[512 / 8];
struct ofpbuf actions, put_actions; ofpbuf_use_stub(&actions, actions_stub, sizeof actions_stub);
ofpbuf_use_stub(&put_actions, slow_stub, sizeof slow_stub); DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
struct dp_netdev_flow *netdev_flow; if (OVS_LIKELY(rules[i])) {
continue;
}
// 此时可能已经更新了,在进入upcall之前如果再查一次,如果能够查到,会比upcall消耗的少得多
netdev_flow = dp_netdev_pmd_lookup_flow(pmd, keys[i],
&add_lookup_cnt);
if (netdev_flow) {
lookup_cnt += add_lookup_cnt;
rules[i] = &netdev_flow->cr;
continue;
}
// 第一级和第二级流表查找失败后,就要查找第三级流表了,即openflow流表,这也称为upcall调用。
// 在普通ovs下是通过netlink实现的,在ovs+dpdk下,直接在pmd线程中调用upcall_cb即可。
// 开始查找openflow流表。如果查找openflow流表成功并需要下发到dpcls时,需要判断是否超出最大流表限制
int error = handle_packet_upcall(pmd, packet, keys[i],
&actions, &put_actions); if (OVS_UNLIKELY(error)) {
upcall_fail_cnt++;
} else {
upcall_ok_cnt++;
}
} ofpbuf_uninit(&actions);
ofpbuf_uninit(&put_actions);
fat_rwlock_unlock(&dp->upcall_rwlock);
} else if (OVS_UNLIKELY(any_miss)) {
DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
if (OVS_UNLIKELY(!rules[i])) {
dp_packet_delete(packet);
COVERAGE_INC(datapath_drop_lock_error);
upcall_fail_cnt++;
}
}
} DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
struct dp_netdev_flow *flow;
/* Get the original order of this packet in received batch. */
int recv_idx = index_map[i];
uint16_t tcp_flags; if (OVS_UNLIKELY(!rules[i])) {
continue;
} flow = dp_netdev_flow_cast(rules[i]); bool hook_cached = false;
if (pmd->cached_hook && \
pmd->cached_hook_pmd && \
pmd->cached_hook->hook_flow_miss) {
hook_cached = pmd->cached_hook->hook_flow_miss(pmd->cached_hook_pmd, packet, flow);
} if (!hook_cached) {
bool smc_enable_db;
atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db);
// 查找到了packet,如果开启了smc,更新smc
if (smc_enable_db) {
uint32_t hash = dp_netdev_flow_hash(&flow->ufid);
smc_insert(pmd, keys[i], hash);
}
// 查到了packet,看是否写会更新上一层cache(EMC)
if (emc_probabilistic_insert(pmd, keys[i], flow)) {
if (flow->status == OFFLOAD_NONE) {
queue_netdev_flow_put(pmd->dp->dp_flow_offload, \
pmd->dp->class, \
flow, NULL, DP_NETDEV_FLOW_OFFLOAD_OP_ADD);
}
}
}
/* Add these packets into the flow map in the same order
* as received.
*/
tcp_flags = miniflow_get_tcp_flags(&keys[i]->mf);
packet_enqueue_to_flow_map(packet, flow, tcp_flags,
flow_map, recv_idx);
}
// 更新各个信息
pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,
cnt - upcall_ok_cnt - upcall_fail_cnt);
pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP,
lookup_cnt);
pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MISS,
upcall_ok_cnt);
pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_LOST,
upcall_fail_cnt);
}
2.4 smc更新smc_insert()
static inline void
smc_insert(struct dp_netdev_pmd_thread *pmd,
const struct netdev_flow_key *key,
uint32_t hash)
{
struct smc_cache *smc_cache = &(pmd->flow_cache).smc_cache;
struct smc_bucket *bucket = &smc_cache->buckets[key->hash & SMC_MASK];
uint16_t index;
uint32_t cmap_index;
int i; //布谷鸟算法
cmap_index = cmap_find_index(&pmd->flow_table, hash);
index = (cmap_index >= UINT16_MAX) ? UINT16_MAX : (uint16_t)cmap_index; /* If the index is larger than SMC can handle (uint16_t), we don't
* insert */
if (index == UINT16_MAX) {
//表明找到了
return;
} /* If an entry with same signature already exists, update the index */
uint16_t sig = key->hash >> 16;
for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
if (bucket->sig[i] == sig) {
bucket->flow_idx[i] = index;
return;
}
}
/* If there is an empty entry, occupy it. */
for (i = 0; i < SMC_ENTRY_PER_BUCKET; i++) {
if (bucket->flow_idx[i] == UINT16_MAX) {
bucket->sig[i] = sig;
bucket->flow_idx[i] = index;
return;
}
}
/* Otherwise, pick a random entry. */
i = random_uint32() % SMC_ENTRY_PER_BUCKET;
bucket->sig[i] = sig;
bucket->flow_idx[i] = index;
}
3. upcall到openflow查找,然后更新dpcls
static bool
cmap_try_insert(struct cmap_impl *impl, struct cmap_node *node, uint32_t hash)
{
uint32_t h1 = rehash(impl, hash);
uint32_t h2 = other_hash(h1);
// hash两次找到两个桶
struct cmap_bucket *b1 = &impl->buckets[h1 & impl->mask];
struct cmap_bucket *b2 = &impl->buckets[h2 & impl->mask]; // 插入规则:
// 1.是否有相同hash的node,就插到对应链上
// 2.没有相同hash,就看有没有空的node
// 3.都不行就通过bfs,看能否让b1,b2空出来一个,把这个放进去
// 都不行就插入失败
return (OVS_UNLIKELY(cmap_insert_dup(node, hash, b1) ||
cmap_insert_dup(node, hash, b2)) ||
OVS_LIKELY(cmap_insert_bucket(node, hash, b1) ||
cmap_insert_bucket(node, hash, b2)) ||
cmap_insert_bfs(impl, node, hash, b1, b2));
}
参考博客:
OVS-DPDK 流表查询详解的更多相关文章
- (转)Mysql 多表查询详解
MySQL 多表查询详解 一.前言 二.示例 三.注意事项 一.前言 上篇讲到mysql中关键字执行的顺序,只涉及了一张表:实际应用大部分情况下,查询语句都会涉及到多张表格 : 1.1 多表连接有 ...
- Mysql 多表查询详解
Reference: https://blog.csdn.net/jintao_ma/article/details/51260458 一.前言 二.示例 三.注意事项 一.前言 上篇讲到Mysql ...
- MySQL简单查询详解-单表查询
MySQL简单查询详解-单表查询 作者:尹正杰 版权声明:原创作品,谢绝转载!否则将追究法律责任. 一.查询的执行路径 一条SQL查询语句的执行过程大致如下图所示: 1>.客户端和服务端通过my ...
- ovs源码阅读--流表查询原理
背景 在ovs交换机中,报文的处理流程可以划分为一下三个步骤:协议解析,表项查找和动作执行,其中最耗时的步骤在于表项查找,往往一个流表中有数目巨大的表项,如何根据数据报文的信息快速的查找到对应的流表项 ...
- openVswitch(OVS)源代码分析之工作流程(flow流表查询)
原文链接: openVswitch(OVS)源代码分析之工作流程(flow流表查询)
- Neutron 理解 (4): Neutron OVS OpenFlow 流表 和 L2 Population [Netruon OVS OpenFlow tables + L2 Population]
学习 Neutron 系列文章: (1)Neutron 所实现的虚拟化网络 (2)Neutron OpenvSwitch + VLAN 虚拟网络 (3)Neutron OpenvSwitch + GR ...
- ElasticSearch第四步-查询详解
ElasticSearch系列学习 ElasticSearch第一步-环境配置 ElasticSearch第二步-CRUD之Sense ElasticSearch第三步-中文分词 ElasticSea ...
- oracle表分区详解
原文来自:http://www.cnblogs.com/leiOOlei/archive/2012/06/08/2541306.html oracle表分区详解 从以下几个方面来整理关于分区表的概念及 ...
- ThinkPHP视图查询详解
ThinkPHP视图查询详解 参考http://www.jb51.net/article/51674.htm 这篇文章主要介绍了ThinkPHP视图查询,需要的朋友可以参考下 ThinkP ...
- SQL Server表分区详解
原文:SQL Server表分区详解 什么是表分区 一般情况下,我们建立数据库表时,表数据都存放在一个文件里. 但是如果是分区表的话,表数据就会按照你指定的规则分放到不同的文件里,把一个大的数据文件拆 ...
随机推荐
- CSS3-transform位移实现双开门效果
transform可以用于实现位移,旋转,缩放等效果. 位移:transform: translate(水平距离,垂直距离); 这里先借助其位移属性实现双开门的效果,鼠标hover的时候最上面的图片向 ...
- Redis设计实现-学习笔记
最近在准备面试,问到redis相关知识,只能说个皮毛,说的既不深入也不全面,所以抓紧突击一下,先学<redis设计与实现>. 选择看书的原因是: 书中全面深入,且能出书一定十分用心: 搜博 ...
- [2001年NOIP普及组] 求先序排列
给出一棵二叉树的中序与后序排列.求出它的先序排列.(约定树结点用不同的大写字母表示,长度<=8). 输入 第一行输入一个字符串表示二叉树的中序排列,第二行输入一个字符串表示二叉树的后序排列. 输 ...
- 微信小程序:微信web开发阶段性学习总结
小程序运行机制 前台/后台状态 小程序启动后,界面被展示给用户,此时小程序处于前台状态. 当用户点击右上角胶囊按钮关闭小程序,或者按了设备 Home 键离开微信时,小程序并没有完全终止运行,而是进入了 ...
- 浏览器F12,Network中各按钮的作用
Network下 preserve log:勾选,页面发生跳转,接口不丢失:(比如登录成功跳转到首页,登录的接口就没了,勾选Perserve log,会记录跳转前的接口): Disable cache ...
- RDD编程
一.词频统计 1.读文本文件生成RDD lines 2.将一行一行的文本分割成单词 words flatmap() 3.全部转换为小写 lower() 4.去掉长度小于3的单词 filter() 5. ...
- JavaScript 之 数组在内存中的存储方式(连续或不连续)
最近在纠结一个问题,就是数组这个引用类型在JavaScript 中是不是和其他语言一样开辟了一个连续的内存来存储,但是在JS 中每个元素又可以是不同的类型,这就导致了没办法用一个相同大小的存储,所以数 ...
- vscode php 方法无法跳转问题
安装扩展 PHP IntelliSense #路径感知 PHP Intelephense #自动补全 PHP DocBlocker #在类或属性.方法上面敲/**回车自动生成参数注释等 PHP Na ...
- Debug --> python中的True False 0 1
今天看了下python中的一些基础知识,以offer64为例叭! 求 1+2+...+n ,要求不能使用乘除法.for.while.if.else.switch.case等关键字及条件判断语句(A?B ...
- eset node32卸载记录
安装的是这个东西,卸载麻烦 1.一般的卸载软件比如wise program uninstall无论是普通卸载还是强制卸载都是实现不了的,火绒自带的文件粉碎是可以使用的,有两个目录要进行粉碎C:\Pro ...