基于AC有限状态机的多模匹配算法

参考链接：http://www.cnblogs.com/zzqcn/p/3525636.html

感谢原文作者。

花了两天半时间实现并测试了算法。

按照上文的思路实现了一遍。可能是原文中有些地方描述得不是特别清楚，一开始测试时出现了多处匹配遗漏的情况，后来经过反复排查，终于把这些遗漏全部解决了。

同时在实现过程中也遇到了各种小问题，最后都解决了，总结起来主要有四个大坑，自己实现的时候需要注意，四个坑都在代码的注释里面了。

这里的实现虽然不会有遗漏的情况，但会有同一模式串在相同的偏移多次被命中的情况，但无伤大雅，至少没有遗漏不是吗。实际应用中只需对结果做去重就好了。

测试结论：对一个101.3MB的PE，从中随机抽取长度在[16-116)Bytes的模式串16个，分别用memcmp方式和AC自动机方式进行匹配，memcmp方式耗时33秒，AC方式耗时12秒，可见优势还是比较明显的。

代码中如有哪里不对，欢迎一起讨论。

 #include <cstdlib>

 #include <cstdio>

 #include <cstring>

 #include <stdint.h>

 #include <vector>

 #include <map>

 #include <queue>

 #include <ctime>

 // One state of the Aho-Corasick automaton: a trie node plus its failure link.
 // Nodes are allocated with calloc() in InitACGoto and released with free() in
 // ReleaseACNodes, so every field starts out zeroed.
 typedef struct ACNode

 {

     // Set by InitACGoto from the byte index inside the pattern; the root keeps the
     // zero left by calloc(). ACSearch hands this value to the callback as the match length.
     uint64_t        u64Depth;

     // Failure link, assigned by ACFail. ACSearch follows it when the goto table
     // has no entry for the current input byte.
     struct ACNode   *pFail;

     // Goto table (input byte -> next node). Created with new in InitACGoto and
     // destroyed with delete in ReleaseACNodes.
     std::map<unsigned char, struct ACNode *>    *pmpGotoTab;

     // Back-reference used by ACFail: the node this one was created under and the
     // byte that leads from that parent to this node.
     struct ACParrent

     {

         struct ACNode   *pParent;

         unsigned char   ucCondition;

     } Parent;

     // True when a pattern ends at this node (sic: "Mathed" means "matched").
     // Set by InitACGoto, read by ACSearch.
     bool            bIsMathed;

 } AC_NODE, *P_AC_NODE;

 // Signature of the callback ACSearch invokes for every reported hit.
 //   In_pucBuf    - the buffer being searched (ACSearch passes its own In_pucBuf through).
 //   In_u64EndPos - the scan index ACSearch holds when it reports the hit;
 //                  ACFoundCallBack derives the start offset as In_u64EndPos - In_u64Len.
 //   In_u64Len    - u64Depth of the node that matched.
 // NOTE(review): __stdcall is a compiler-specific calling-convention keyword; this
 // presumably targets MSVC/Win32 - confirm before building with another toolchain.
 typedef void (__stdcall *P_AC_FOUND_CALLBACK)(const unsigned char *In_pucBuf, uint64_t In_u64EndPos, uint64_t In_u64Len);

 /**
  * Builds the goto (trie) part of the Aho-Corasick automaton.
  *
  * A root node is created whose goto table maps all 256 byte values to the root
  * itself; each pattern is then threaded through the trie, replacing those
  * self-loops with real children where needed. Every node created is appended
  * to Out_vctACNodes, root first, so the caller can release them with
  * ReleaseACNodes even when this function fails half-way.
  *
  * @param In_vctPattern   Patterns to compile. Must be non-empty and contain no
  *                        null pointers. Zero-length patterns add no states.
  * @param Out_vctACNodes  Receives the nodes. ACFail treats element 0 as the
  *                        root, so pass an empty vector.
  * @return 0 on success, -1 on invalid input or allocation failure.
  */
 int InitACGoto(const std::vector<const std::vector<unsigned char> *> &In_vctPattern,
     std::vector<P_AC_NODE> &Out_vctACNodes)
 {
     typedef std::map<unsigned char, struct ACNode *> GotoTable;

     if (In_vctPattern.empty())
         return -1;

     // Validate before allocating anything so a bad input leaves no partial state.
     for (size_t uiPattIdx = 0; uiPattIdx < In_vctPattern.size(); ++uiPattIdx)
     {
         if (In_vctPattern[uiPattIdx] == NULL)
             return -1;
     }

     // Nodes are calloc'ed (not new'ed) because ReleaseACNodes frees them with free().
     P_AC_NODE pRoot = (P_AC_NODE)calloc(1, sizeof(AC_NODE));
     if (pRoot == NULL)
         return -1;

     // The root accepts every byte: unmatched bytes loop back to the root, which
     // is what guarantees that failure-link walks always terminate.
     pRoot->pmpGotoTab = new GotoTable();
     for (unsigned int uiByte = 0; uiByte <= 0xff; ++uiByte)
         (*pRoot->pmpGotoTab)[(unsigned char)uiByte] = pRoot;
     Out_vctACNodes.push_back(pRoot);

     for (size_t uiPattIdx = 0; uiPattIdx < In_vctPattern.size(); ++uiPattIdx)
     {
         const std::vector<unsigned char> &vctPattern = *In_vctPattern[uiPattIdx];
         P_AC_NODE pCurNode = pRoot;

         for (size_t uiUCharIdx = 0; uiUCharIdx < vctPattern.size(); ++uiUCharIdx)
         {
             const unsigned char ucCurUChar = vctPattern[uiUCharIdx];
             GotoTable::iterator itGoto = pCurNode->pmpGotoTab->find(ucCurUChar);

             // An existing edge to a real child is reused. An edge that points at
             // the root is only the root's placeholder self-loop, not a child.
             if (itGoto != pCurNode->pmpGotoTab->end() && itGoto->second != pRoot)
             {
                 pCurNode = itGoto->second;
                 continue;
             }

             P_AC_NODE pNode = (P_AC_NODE)calloc(1, sizeof(AC_NODE));
             if (pNode == NULL)
                 return -1;

             pNode->u64Depth = uiUCharIdx + 1;
             pNode->Parent.pParent = pCurNode;
             pNode->Parent.ucCondition = ucCurUChar;
             pNode->pmpGotoTab = new GotoTable();

             // operator[] overwrites the root's self-loop or inserts a new edge.
             (*pCurNode->pmpGotoTab)[ucCurUChar] = pNode;
             Out_vctACNodes.push_back(pNode);
             pCurNode = pNode;
         }

         // The node reached after the last byte is an accepting state.
         if (!vctPattern.empty())
             pCurNode->bIsMathed = true;
     }

     return 0;
 }

 int ACFail(std::vector<P_AC_NODE> &Out_vctACNodes)

 {

     int                     iRetVal = ;

     std::queue<P_AC_NODE>   quNodes;

     if (Out_vctACNodes.empty())

     {

         iRetVal = -;

         goto fun_ret;

     }

     quNodes.push(Out_vctACNodes[]);

     while (!quNodes.empty())

     {

         std::map<unsigned char, struct ACNode *>::iterator  itGoto;

         P_AC_NODE   pNode = quNodes.front();

         quNodes.pop();

         if (pNode->u64Depth <= )

             pNode->pFail = Out_vctACNodes[];

         else

         {

             P_AC_NODE   pParentFail = pNode->Parent.pParent->pFail;

             while (pParentFail->pmpGotoTab->find(pNode->Parent.ucCondition) == pParentFail->pmpGotoTab->end())

                 pParentFail = pParentFail->pFail;

             pNode->pFail = pParentFail->pmpGotoTab->at(pNode->Parent.ucCondition);

         }

         for (itGoto = pNode->pmpGotoTab->begin(); itGoto != pNode->pmpGotoTab->end(); itGoto ++)

         {

             if (itGoto->second != Out_vctACNodes[])

                 quNodes.push(itGoto->second);

         }

     }

 fun_ret:

     return iRetVal;

 }

 /**
  * Sample match callback: prints the start offset of a hit in hexadecimal.
  *
  * @param In_pucBuf     Buffer that was searched; only checked for NULL.
  * @param In_u64EndPos  Offset one past the last matched byte.
  * @param In_u64Len     Length of the matched pattern; 0 is ignored.
  */
 void __stdcall ACFoundCallBack(const unsigned char *In_pucBuf, uint64_t In_u64EndPos, uint64_t In_u64Len)
 {
     if (In_pucBuf == NULL || In_u64Len == 0)
         return;

     // The offset is 64-bit; "%x" expects unsigned int, which is undefined
     // behaviour and truncates offsets on large files.
     printf("<<<<<<<<<<FUCKOFF:%llx\n", (unsigned long long)(In_u64EndPos - In_u64Len));
 }

 int ACSearch(const P_AC_NODE In_pRoot, const unsigned char *In_pucBuf, uint64_t In_u64BufLen, P_AC_FOUND_CALLBACK In_pfCallBack)

 {

     int         iRetVal     = ;

     P_AC_NODE   pCurrent    = NULL;

     uint64_t    u64Idx      = ;

     if (In_pRoot == NULL || In_pucBuf == NULL || In_u64BufLen ==  || In_pfCallBack == NULL)

     {

         iRetVal = -;

         goto fun_ret;

     }

     pCurrent = In_pRoot;

     for (u64Idx = ; u64Idx < In_u64BufLen;)

     {

         P_AC_NODE   pFail   = NULL;

         if (pCurrent->pmpGotoTab->find(In_pucBuf[u64Idx]) != pCurrent->pmpGotoTab->end())

         {

             pCurrent = pCurrent->pmpGotoTab->at(In_pucBuf[u64Idx]);

             //坑1，出现匹配失败时不要前进，只在匹配成功时前进

             u64Idx ++;

         }

         else

             pCurrent = pCurrent->pFail;

         //坑3，每个节点都需要沿着失配指针一直向上找所有匹配到的结果，而不是

         //只在匹配成功时才这么做，否则会出现匹配遗漏（形如“abcd”和“bc”这样的特征串并存的情况）

         pFail = pCurrent->pFail;

         //坑4，一定要走到根，否则会出现匹配遗漏

         while (pFail != In_pRoot)

         {

             if (pFail->bIsMathed)

                 In_pfCallBack(In_pucBuf, u64Idx, pFail->u64Depth);

             pFail = pFail->pFail;

         }

         //坑2，不管是否匹配成功，都要判断当前节点状态，因为出现失配后的

         //转移也有可能转到一个成功匹配的节点上

         if (pCurrent->bIsMathed)

             In_pfCallBack(In_pucBuf, u64Idx, pCurrent->u64Depth);

     }

 fun_ret:

     return iRetVal;

 }

 void ReleaseACNodes(std::vector<P_AC_NODE> &Out_vctACNodes)

 {

     unsigned int    uiIdx   = ;

     for (uiIdx = ; uiIdx < Out_vctACNodes.size(); uiIdx ++)

     {

         delete Out_vctACNodes[uiIdx]->pmpGotoTab;

         free(Out_vctACNodes[uiIdx]);

     }

     Out_vctACNodes.clear();

 }

 void main(int argc, char **argv)

 {

     std::vector<P_AC_NODE>  vctNodes;

     std::vector<const std::vector<unsigned char> *> vctPatterns;

     unsigned char   *pucBuf = NULL;

     FILE            *pf     = NULL;

     long            lFileSize   = ;

     time_t          tACBegin    = {};

     double          dMemSec     = 0.0;

     pf = fopen(argv[], "rb");

     fseek(pf, , SEEK_END);

     lFileSize = ftell(pf);

     fseek(pf, , SEEK_SET);

     pucBuf = (unsigned char *)calloc(lFileSize, );

     fread(pucBuf, , lFileSize, pf);

     fclose(pf);

     for (int i = ; i < ; i ++)

     {

         std::vector<unsigned char>  *pvctPattern = new std::vector<unsigned char>();

         int iBegin  = rand() % (lFileSize - );

         int iLen    = rand() %  + ;

         for (int j = ; j < iLen; j ++)

             pvctPattern->push_back(pucBuf[j + iBegin]);

         vctPatterns.push_back(pvctPattern);

         printf("%x:%u\n", iBegin, iLen);

         for (long j = ; j < lFileSize - iLen; j ++)

         {

             time_t  tMemBegin   = time(NULL);

             if (memcmp(pucBuf + iBegin, pucBuf + j, iLen) == )

                 printf(">>>>>>>>>>Off:%x\n", j);

             dMemSec += difftime(time(NULL), tMemBegin);

         }

     }

     InitACGoto(vctPatterns, vctNodes);

     ACFail(vctNodes);

     tACBegin = time(NULL);

     ACSearch(vctNodes[], pucBuf, lFileSize, ACFoundCallBack);

     printf("MemTime::%f\nACTime::%f\n", dMemSec, difftime(time(NULL), tACBegin));

     ReleaseACNodes(vctNodes);

     return;

 }

基于AC有限状态机的多模匹配算法的更多相关文章

基于Qt有限状态机的一种实现方式和完善的人工智能方法
基于Qt有限状态机的一种实现方式和完善的人工智能方法人工智能在今年是一个非常火的方向,当然了.不不过今年,它一直火了非常多年,有关人工智能的一些算法层出不穷.人工智能在非常多领域都有应用,就拿我熟悉 ...
java实现多模匹配算法
这个是好几年前写的了.都统一放到cnblogs上面. --------------------------------Node ---------------------------------- p ...
基于KMP与Levenshtein模糊匹配算法的银行联行号查询（转）
在人民银行那里,每个银行的每一个营业网点都有自己唯一的银行联行号,根据这个号码能快速定位一间银行具体的分支行,就像根据一个身份证号码能快速确定一个人一样.例如汇款时,汇款单上要求填写收款人开户行,然后 ...
基于KMP与Levenshtein模糊匹配算法的银行联行号查询
在人民银行那里,每个银行的每一个营业网点都有自己唯一的银行联行号,根据这个号码能快速定位一间银行具体的分支行,就像根据一个身份证号码能快速确定一个人一样.例如汇款时,汇款单上要求填写收款人开户行,然后 ...
基于Unity有限状态机框架
这个框架是Unity wiki上的框架.网址:http://wiki.unity3d.com/index.php/Finite_State_Machine 这就相当于是“模板”吧,自己写的代码,写啥都 ...
多模匹配算法之Aho-Corasick
除剔除那些含有敏感词的文本,由于有大量的敏感词,所以通过简单的正则表达式和字符串查找的方式效率太低,每次都有遍历一次字符串.而AC算法的核心思想就是避免不必要的回溯使搜索一直沿着向前的方向,最大可能的 ...
POJ 3691 DNA repair 基于AC自动机DP
dp[i][j] 它表示的长度 i 下游前缀 j 更改节点的最小数量. 很清楚dp[0][0] = 0; dp[ i ][ j ] = min(dp[ i ][ j ],dp[i-1][k] + (j ...
AC多模式匹配算法
建议:学习ac算法最好的途径是看论文pdf_Efficient_String_Matching_An_Aid_to_Biblio 一.一般的搜索算法 keyword: { he, she, his, ...
【工程应用一】多目标多角度的快速模板匹配算法（基于NCC，效果无限接近Halcon中........)
愿意写代码的人一般都不太愿意去写文章,因为代码方面的艺术和文字中的美学往往很难兼得,两者都兼得的人通常都已经被西方极乐世界所收罗,我也是只喜欢写代码,让那些字母组成美妙的歌曲,然后自我沉浸在其中自得其 ...

随机推荐

Selenium3 + Python3自动化测试系列二——selenium元素定位
一.selenium元素定位 Selenium对网页的控制是基于各种前端元素的,在使用过程中,对于元素的定位是基础,只有准确抓取到对应元素才能进行后续的自动化控制,我在这里将对selenium8种元 ...
前端解读面向切面编程(AOP)
前言面向对象(OOP)作为经典的设计范式,对于我们来说可谓无人不知,还记得我们入行起始时那句经典的总结吗-万事万物皆对象. 是的,基于OOP思想封装.继承.多态的特点,我们会自然而然的遵循模块化.组 ...
4923: [Lydsy1706月赛]K小值查询平衡树非旋转Treap
国际惯例的题面:这种维护排序序列,严格大于的进行操作的题都很套路......我们按照[0,k],(k,2k],(2k,inf)分类讨论一下就好.显然第一个区间的不会变化,第二个区间的会被平移进第一个区 ...
PHP Math 函数 mt_rand() 使用 Mersenne Twister 算法返回随机整数。
语法 mt_rand(min,max) 说明如果没有提供可选参数 min 和 max,mt_rand() 返回 0 到 RAND_MAX 之间的伪随机数.例如想要 5 到 15(包括 5 和 15) ...
一款功能强悍的web磁盘管理工具（A powerful web disk management tools）
https://github.com/kingAnyWHere/web-ftp web-ftp 一款功能强悍的web磁盘管理工具 (A powerful web disk management too ...
bzoj 2013 上升计数
题意: 给一个数集和一个数d,问满足下列要求的排列数(相同的数要区分): a[i]+d>=a[i+1] ( i in [1,n) ) 因为数的给出顺序不重要,所以先排序,假如我们已经解决了前i ...
startup.bat闪退问题
startup.bat闪退问题我自己遇到的 1.例如: 手动点击startup.bat 后 ,一闪而过 2例如:在cmd下进到tomcat的bin目录运行 startup.bat 解决问题,第一 ...
MIRUO面试题
1.c#可以继承string类吗?2.接口可以实现接口吗?抽象类可以实现接口吗?抽象类可以实现实体类吗?3.用C#计算2.5的3次方的方法.4.什么是协同程序?5.GC是什么,如何减少内存,如何加快性 ...
Hadoop化繁为简(三)—探索Mapreduce简要原理与实践
目录-探索mapreduce 1.Mapreduce的模型简介与特性?Yarn的作用? 2.mapreduce的工作原理是怎样的? 3.配置Yarn与Mapreduce.演示Mapreduce例子程序 ...
C++ - 定义无双引号的字符串宏
在某些特殊场合下,我们可能需要定义一个字符串宏,但又不能用双引号比如像这样 #define HELLO hello world 如果我们只是简单的展开HELLO,肯定会无法编译 std::cout ...

基于AC有限状态机的多模匹配算法

基于AC有限状态机的多模匹配算法的更多相关文章

随机推荐

热门专题