C++ 简单中文敏感词检测工具类
具体思路:
1->敏感词库,可从数据库读取,也可以从文件加载.
2->将敏感词转化为gbk编码,因为gbk严格按照英文字符一个字节,汉字两个字节的格式编码,便于切分文字段.
3->将所有敏感词以首个字符[英文一字节,汉字两字节]转换为一个整数,然后按照这个整数给所有敏感词建立索引,索引的value用list,因为考虑到同一个整数对应多个关键字.
4->检测一段文字内容时,也事先转化为gbk,然后逐个字符[英文一字节,汉字两字节]检测是否有以该字符为首的敏感词.
代码.h
#ifndef SENSITIVE_WORDS_CHECKER_
#define SENSITIVE_WORDS_CHECKER_
#include <stdint.h>
#include <stdio.h>
#include <memory.h>
#include <map>
#include <vector>

// Size limits used throughout the checker (all in bytes, terminator included).
enum {
    // Maximum length of one sensitive word.
    // NOTE(review): the literal was missing from the source; 64 is an assumed value - confirm.
    enmMaxWordLength = 64,
    // Maximum size of the sensitive-word file: 128k.
    enmMaxWordsFileLength = 128 * 1024,
    // Maximum length of the content passed to a single check.
    // NOTE(review): the literal was missing from the source; 1024 is an assumed value - confirm.
    enmMaxContentLength = 1024,
};

// One sensitive word, stored as a NUL-terminated byte string.
struct SensitiveWord
{
    char szWord[enmMaxWordLength];

    // Zero-fills the buffer so a shorter word copied in later is already terminated.
    SensitiveWord()
    {
        memset(szWord, 0, enmMaxWordLength);
    }
};

// Words sharing the same first-character key (non-owning pointers into the word array).
typedef std::vector<SensitiveWord*> WordList;
// First-character key -> list of words starting with that character.
typedef std::map<uint32_t, WordList*> WordMap;

// Loads a list of sensitive words and indexes them by the integer key of their
// first character. The public interface only loads words; the checking and test
// helpers are protected/private.
class SensitiveWordsChecker
{
public:
    SensitiveWordsChecker() : arrSensitiveWord(NULL), nSensitiveWordCnt(0) {}

    // Releases the word array and every WordList the index allocated with new.
    ~SensitiveWordsChecker()
    {
        for (WordMap::iterator it = mapWords.begin(); it != mapWords.end(); ++it)
        {
            delete it->second;
        }
        delete[] arrSensitiveWord;
    }

public:
    void LoadWordsFromUTF8File(const char *file_name);
    void LoadWordsFromGBKFile(const char *file_name);

protected:
    int32_t WriteToFile(const char buf[], const int32_t buf_size, const char *file_name);
    void DumpWordMap();
    void GenTestData();
    void Test();
    void StrAppend(char buf[], const uint32_t bufLen, uint32_t &offset, const char *fmt, ...);

private:
    // The object owns raw allocations, so a member-wise copy would free them twice.
    // Declared and left undefined to make copying a compile/link error.
    SensitiveWordsChecker(const SensitiveWordsChecker &);
    SensitiveWordsChecker &operator=(const SensitiveWordsChecker &);

    int32_t LoadFile(char buf[], const uint32_t buf_size, const char *file_name);
    int32_t CodeConvert(char *from_charset, char *to_charset, char *inbuf, size_t inlen, char *outbuf, size_t outlen);
    int32_t UTF8_To_GBK(char *inbuf, size_t inlen, char *outbuf, size_t outlen);
    int32_t GBK_To_UTF8(char *inbuf, size_t inlen, char *outbuf, size_t outlen);
    uint32_t GetWordsCount(char buf[], const uint32_t buf_size, char separator);
    char *StrcpyExcludeChar(char *dst, const uint32_t dst_len, const char *src, const char *exclude_list);
    int32_t GetWords(char gbk_buf[], const uint32_t buf_size, char separator);
    void BuildWordMap();
    uint32_t GetFirstCharFromGBK(char gbk_buf[]);
    // NOTE(review): "TUF8" looks like a typo for "UTF8"; the name is kept because the
    // definition in the .cpp uses the same spelling.
    uint32_t GetFirstCharFromTUF8(char utf8_buf[]);
    uint32_t GetFirstChar(char buf[]);
    // Returns 0 when in_utf8_buf contains no sensitive word.
    // Returns 1 when in_utf8_buf contains a sensitive word; the word is replaced
    // with '*' in the text written to out_utf8_buf.
    int32_t CheckSensitiveWord(char out_utf8_buf[], char in_utf8_buf[]);
    const SensitiveWord* FindSensitiveWord(uint32_t code, const char *pos);

private:
    SensitiveWord *arrSensitiveWord;  // array allocated with new[]
    uint32_t nSensitiveWordCnt;       // number of used entries in arrSensitiveWord
    WordMap mapWords;                 // first-character key -> words
};

#endif
.cpp
#include "SenditiveWordsChecker.h"
#include "stdio.h"
#include "string.h"
#include "iconv.h"
#include <stdarg.h>
#include <new> void SensitiveWordsChecker::LoadWordsFromUTF8File(const char *file_name)
{
char utf8_buf[enmMaxWordsFileLength] , gbk_buf[enmMaxWordsFileLength];
LoadFile(utf8_buf, enmMaxWordsFileLength, file_name);
UTF8_To_GBK(utf8_buf, strlen(utf8_buf), gbk_buf, enmMaxWordsFileLength);
GetWords(gbk_buf, enmMaxWordsFileLength, ',');
}

// Loads a comma-separated sensitive-word list from a file that is already GBK
// encoded and hands it to GetWords(). If the file cannot be read, nothing is loaded.
void SensitiveWordsChecker::LoadWordsFromGBKFile(const char *file_name)
{
    // Heap storage: a file-sized array is too large to keep on the stack.
    std::vector<char> gbk_buf(enmMaxWordsFileLength, '\0');

    // LoadFile() returns non-zero on failure; the buffer must not be used then.
    if (LoadFile(&gbk_buf[0], enmMaxWordsFileLength, file_name) != 0)
    {
        return;
    }
    GetWords(&gbk_buf[0], enmMaxWordsFileLength, ',');
}

// Reads the whole file `file_name` into `buf` and NUL-terminates it.
//   buf       destination buffer
//   buf_size  capacity of buf in bytes; the file must be strictly smaller so the
//             terminator fits
// Returns the result of fclose() (0) on success and -1 on any failure. On failure
// buf holds an empty string, so callers that ignore the result never read garbage.
// NOTE(review): the numeric literals were missing from the source; 0 / 1 / -1 are
// the values the surrounding calls imply.
int32_t SensitiveWordsChecker::LoadFile(char buf[], const uint32_t buf_size, const char *file_name)
{
    if (buf == NULL || buf_size == 0 || file_name == NULL)
    {
        fputs("File error\n", stderr);
        return -1;
    }
    buf[0] = '\0';

    FILE *pFile = NULL;
    fopen_s(&pFile, file_name, "rb");
    if (pFile == NULL) { fputs("File error\n", stderr); return -1; }

    // Obtain the file size.
    long file_len = -1;
    if (fseek(pFile, 0, SEEK_END) == 0)
    {
        file_len = ftell(pFile);
    }
    rewind(pFile);

    bool ok = false;
    if (file_len < 0)
    {
        fputs("File error\n", stderr);
    }
    else if (static_cast<unsigned long>(file_len) >= buf_size)
    {
        fputs("file too large\n", stderr);
    }
    else if (fread(buf, 1, static_cast<size_t>(file_len), pFile) != static_cast<size_t>(file_len))
    {
        fputs("Reading error\n", stderr);
        buf[0] = '\0';  // discard the partial read
    }
    else
    {
        buf[file_len] = '\0';
        ok = true;
    }

    // Close on every path; the error returns used to leak the handle.
    const int close_ret = fclose(pFile);
    return ok ? close_ret : -1;
} int32_t SensitiveWordsChecker::CodeConvert(char *from_charset, char *to_charset, char *inbuf, size_t inlen, char *outbuf, size_t outlen)
{
iconv_t cd;
char **pin = &inbuf;
char **pout = &outbuf; cd = iconv_open(to_charset, from_charset);
if (cd == )
return -;
memset(outbuf, , outlen);
if (iconv(cd, pin, &inlen, pout, &outlen) == -)
return -;
iconv_close(cd);
*pout = '\0';
return ;
}

// Converts UTF-8 text to GBK. Forwards to CodeConvert() and returns its result.
int32_t SensitiveWordsChecker::UTF8_To_GBK(char *inbuf, size_t inlen, char *outbuf, size_t outlen)
{
    // CodeConvert() takes mutable char* names; binding a string literal to char*
    // is ill-formed since C++11, so pass writable copies.
    char from_charset[] = "utf-8";
    char to_charset[] = "gbk";
    return CodeConvert(from_charset, to_charset, inbuf, inlen, outbuf, outlen);
}

// Converts GBK text to UTF-8. Forwards to CodeConvert() and returns its result.
int32_t SensitiveWordsChecker::GBK_To_UTF8(char *inbuf, size_t inlen, char *outbuf, size_t outlen)
{
    // CodeConvert() takes mutable char* names; binding a string literal to char*
    // is ill-formed since C++11, so pass writable copies.
    char from_charset[] = "gbk";
    char to_charset[] = "utf-8";
    return CodeConvert(from_charset, to_charset, inbuf, inlen, outbuf, outlen);
}

// Counts how many times `separator` occurs in the NUL-terminated text in `buf`.
// At most `buf_size` bytes are examined. Returns 0 for a null buffer.
// Each word in the list is followed by a separator, so the count is also the
// number of separator-terminated words.
uint32_t SensitiveWordsChecker::GetWordsCount(char buf[], const uint32_t buf_size, char separator)
{
    if (buf == NULL)
    {
        return 0;
    }
    // Index-based scan: the original started from `buf - 1`, which forms a pointer
    // before the start of the array.
    uint32_t count = 0;
    for (uint32_t pos = 0; pos < buf_size && buf[pos] != '\0'; ++pos)
    {
        if (buf[pos] == separator)
        {
            ++count;
        }
    }
    return count;
}

// Writes `buf_size` bytes from `buf` to `file_name`, replacing any existing file.
// Returns the result of fclose() (0) on success and -1 on failure.
// NOTE(review): the numeric literals were missing from the source; 1 and -1 are assumed.
int32_t SensitiveWordsChecker::WriteToFile(const char buf[], const int32_t buf_size, const char *file_name)
{
    if (buf == NULL || buf_size < 0 || file_name == NULL)
    {
        fputs("File error\n", stderr);
        return -1;
    }

    FILE *pFile = NULL;
    fopen_s(&pFile, file_name, "wb");
    if (pFile == NULL) { fputs("File error\n", stderr); return -1; }

    const size_t wanted = static_cast<size_t>(buf_size);
    const size_t written = fwrite(buf, 1, wanted, pFile);

    // Close before reporting a short write; the error return used to leak the handle.
    const int close_ret = fclose(pFile);
    if (written != wanted) { fputs("Writing error\n", stderr); return -1; }
    return close_ret;
} int32_t SensitiveWordsChecker::GetWords(char gbk_buf[], const uint32_t buf_size, char separator)
{
char buf[enmMaxWordsFileLength];
StrcpyExcludeChar(buf, enmMaxWordsFileLength, gbk_buf, "\n"); //排除换行符
uint32_t nWordsCount = GetWordsCount(buf, buf_size,',');
printf("words_count=%d\n", nWordsCount);
arrSensitiveWord = new SensitiveWord[nWordsCount];
if (arrSensitiveWord == NULL){return -;}
nSensitiveWordCnt = ;
const char *p = NULL,*q = buf;
while ((p = strchr(q, separator)) != NULL)
{
memcpy(arrSensitiveWord[nSensitiveWordCnt].szWord, q, p - q);
//printf("%s\n", arrSensitiveWord[nSensitiveWordCnt].szWord);
q = p + ;
++nSensitiveWordCnt;
}
BuildWordMap();
return ;
}

// Copies `src` to `dst`, leaving out every character that appears in `exclude_list`.
//   dst           destination buffer
//   dst_len       capacity of dst in bytes, terminator included
//   src           NUL-terminated source text
//   exclude_list  NUL-terminated set of characters to drop; NULL drops nothing
// The result is always NUL-terminated and truncated if it does not fit.
// Returns dst, or NULL when dst or src is NULL. When dst and src are the same
// pointer nothing is copied and dst is returned unchanged.
// NOTE(review): the numeric literals were missing from the source; 0 and 1 are assumed.
char * SensitiveWordsChecker::StrcpyExcludeChar(char *dst, const uint32_t dst_len, const char *src, const char *exclude_list)
{
    // Either pointer being NULL is fatal; the original tested "both NULL" only.
    if (dst == NULL || src == NULL) return NULL;
    if (dst == src) return dst;
    if (dst_len == 0) return dst;

    uint32_t out = 0;
    // "out + 1 < dst_len" keeps room for the terminator; the original could write
    // it one byte past the end when the copy filled dst_len bytes.
    for (uint32_t in = 0; src[in] != '\0' && out + 1 < dst_len; ++in)
    {
        const bool excluded = (exclude_list != NULL) && (strchr(exclude_list, src[in]) != NULL);
        if (!excluded)
        {
            dst[out++] = src[in];
        }
    }
    dst[out] = '\0';
    return dst;
}

// Returns the integer key of the first character of the GBK string `gbk_buf`.
//   - empty (or NULL) string: 0
//   - a byte with the high bit clear, or the only byte of the string: that byte's
//     value converted from signed char (ASCII character)
//   - otherwise: a key built from the first two bytes (GBK encodes a Chinese
//     character in exactly two bytes)
// The key is only used to index and look up words, so lookup code has to derive
// its key with the same formula.
// NOTE(review): the numeric literals were missing from the source; indices 0/1 and
// the constant 256 are assumed - confirm against the original project.
uint32_t SensitiveWordsChecker::GetFirstCharFromGBK(char gbk_buf[])
{
    if (gbk_buf == NULL || gbk_buf[0] == '\0')
    {
        return 0;
    }

    // Go through signed char explicitly so the result does not depend on whether
    // plain char is signed on the target platform.
    const int lead = static_cast<signed char>(gbk_buf[0]);
    if (lead >= 0 || gbk_buf[1] == '\0')
    {
        return static_cast<uint32_t>(lead);  // ASCII character
    }

    const int trail = static_cast<signed char>(gbk_buf[1]);
    const int high = lead + 256;
    const int low = trail + 256;
    return static_cast<uint32_t>(high * 256 + low);
}

// Returns an integer key for the first character of the UTF-8 string `utf8_buf`
// and prints that character to stdout.
//   - empty (or NULL) string: 0, nothing printed
//   - a byte with the high bit clear, or the only byte of the string: that byte's
//     value converted from signed char (ASCII character)
//   - otherwise: a key built from the first three bytes (in UTF-8 most Chinese
//     characters take three bytes)
// NOTE(review): the numeric literals were missing from the source; indices 0/1/2 and
// the multiplier 256 are assumed. No caller is visible in this file.
uint32_t SensitiveWordsChecker::GetFirstCharFromTUF8(char utf8_buf[])
{
    if (utf8_buf == NULL || utf8_buf[0] == '\0')
    {
        return 0;
    }

    // Go through signed char explicitly so the result does not depend on whether
    // plain char is signed on the target platform.
    const int first = static_cast<signed char>(utf8_buf[0]);
    if (first >= 0 || utf8_buf[1] == '\0')
    {
        printf("%c\n", utf8_buf[0]);
        return static_cast<uint32_t>(first);  // ASCII character
    }

    // utf8_buf[2] is at worst the terminator here, because utf8_buf[1] is not.
    const int mid = static_cast<signed char>(utf8_buf[1]);
    const int low = static_cast<signed char>(utf8_buf[2]);
    const int32_t code = first * 256 * 256 + mid * 256 + low;

    char cstr[4];
    cstr[0] = utf8_buf[0];
    cstr[1] = utf8_buf[1];
    cstr[2] = utf8_buf[2];
    cstr[3] = 0;
    printf("%s\n", cstr);
    return static_cast<uint32_t>(code);
}

// Returns the first byte of `buf` converted to uint32_t, or 0 for an empty (or
// NULL) string. No multi-byte decoding is done.
// NOTE(review): the numeric literals were missing from the source; index 0 is assumed.
uint32_t SensitiveWordsChecker::GetFirstChar(char buf[])
{
    if (buf == NULL || buf[0] == '\0')
    {
        return 0;
    }
    return static_cast<uint32_t>(buf[0]);
} void SensitiveWordsChecker::BuildWordMap()
{
WordList *wordList = NULL;
for (uint32_t i = ; i < nSensitiveWordCnt; ++i)
{
uint32_t code = GetFirstCharFromGBK(arrSensitiveWord[i].szWord);
WordMap::iterator it = mapWords.find(code);
if (it == mapWords.end())
{
wordList = new WordList();
mapWords[code] = wordList;
}
else
{
wordList = it->second;
}
wordList->push_back(&arrSensitiveWord[i]);
}
DumpWordMap();
GenTestData();
Test();
}

// Prints the total number of words held in the index to stdout.
void SensitiveWordsChecker::DumpWordMap()
{
    uint32_t word_cnt = 0;
    for (WordMap::const_iterator it = mapWords.begin(); it != mapWords.end(); ++it)
    {
        word_cnt += static_cast<uint32_t>(it->second->size());
    }
    printf("word_cnt = %u\n", word_cnt);
} int32_t SensitiveWordsChecker::CheckSensitiveWord(char out_utf8_buf[], char in_utf8_buf[])
{
// 先把被检测字符串转换为GBK编码
char gbk_buf[enmMaxContentLength],out_gbk_buf[enmMaxContentLength];
UTF8_To_GBK(in_utf8_buf, strlen(in_utf8_buf), gbk_buf, enmMaxContentLength);
// 提取GBK字串里面的每一个字符,去map里面查找以该字符为首的关键词列表
int32_t gbk_buf_len = strlen(gbk_buf);
uint32_t code = , flag = , out_gbk_buf_len = ;
char c = , cstr[] = { };
for (int32_t i = ; i < gbk_buf_len;)
{
flag = ;
if (gbk_buf[i] >= || i == gbk_buf_len - )
{
c = gbk_buf[i];
//printf("%c\n", c); //ASCII字符
code = (uint32_t)c;
flag = ;
out_gbk_buf[out_gbk_buf_len] = c;
}
else
{
flag = ;
short high = (short)gbk_buf[i] + ;
short low = (short)gbk_buf[i + ] + ;
code = high * + low; cstr[] = gbk_buf[i];
cstr[] = gbk_buf[i + ];
cstr[] = ; out_gbk_buf[out_gbk_buf_len] = cstr[];
out_gbk_buf[out_gbk_buf_len + ] = cstr[];
//printf("%s\n", cstr);
}
// 检查敏感词
const SensitiveWord *sensitiveWord = FindSensitiveWord(code, &gbk_buf[i]);
int32_t word_len = ;
if (NULL != sensitiveWord)
{
flag = ;
//printf("%s\n", sensitiveWord->szWord);
word_len = strlen(sensitiveWord->szWord);
memset(&out_gbk_buf[out_gbk_buf_len],'*', word_len);
}
int32_t step = word_len + flag;
i += step;
out_gbk_buf_len += step;
}
out_gbk_buf[out_gbk_buf_len] = '\0';
//printf("out_gbk_buf = %s\n", out_gbk_buf);
GBK_To_UTF8(out_gbk_buf, strlen(out_gbk_buf), out_utf8_buf, enmMaxContentLength);
return ;
}

// Looks for a sensitive word that starts at `pos`.
//   code  first-character key of the text at pos
//   pos   NUL-terminated GBK text positioned at the character to test
// Returns the first word in the list for `code` whose bytes all match the text at
// pos, or NULL when there is none. Empty words never match.
// NOTE(review): the numeric literals were missing from the source; 0 is assumed.
const SensitiveWord* SensitiveWordsChecker::FindSensitiveWord(uint32_t code, const char *pos)
{
    if (pos == NULL) { return NULL; }

    WordMap::const_iterator it = mapWords.find(code);
    if (it == mapWords.end() || it->second == NULL) { return NULL; }

    const WordList &candidates = *it->second;
    for (WordList::const_iterator w = candidates.begin(); w != candidates.end(); ++w)
    {
        const SensitiveWord *candidate = *w;
        const size_t word_len = strlen(candidate->szWord);
        // Identical bytes mean this is a sensitive word. strncmp stops at the end
        // of the text, whereas memcmp would read word_len bytes even past the
        // terminator.
        if (word_len != 0 && strncmp(candidate->szWord, pos, word_len) == 0)
        {
            return candidate;
        }
    }
    return NULL;
} void SensitiveWordsChecker::GenTestData()
{
char in_gbk_buf[enmMaxWordsFileLength], out_gbk_buf[enmMaxWordsFileLength];
LoadFile(in_gbk_buf, enmMaxWordsFileLength, "poem.txt");
int32_t len = strlen(in_gbk_buf);
uint32_t n = ;
for (int32_t i = ; i < len && n < enmMaxWordsFileLength;++i)
{
if (i % == && short(in_gbk_buf[i]) > )
{
int32_t nRandIndex = rand() % nSensitiveWordCnt;
SensitiveWord sensitiveWord = arrSensitiveWord[nRandIndex];
int32_t word_len = strlen(sensitiveWord.szWord);
for (int32_t j = ; j < word_len && n < enmMaxWordsFileLength; ++j)
{
out_gbk_buf[n++] = sensitiveWord.szWord[j];
}
}
out_gbk_buf[n++] = in_gbk_buf[i];
}
out_gbk_buf[n] = '\0';
char out_utf8_buf[enmMaxWordsFileLength];
GBK_To_UTF8(out_gbk_buf, strlen(out_gbk_buf), out_utf8_buf, enmMaxWordsFileLength);
WriteToFile(out_utf8_buf, strlen(out_utf8_buf), "test_data.txt");
}

// Runs the checker over "test_data.txt" (UTF-8) line by line. For every line that
// ends in '\n' it prints the GBK form of the line and of the masked result to
// stdout, and appends the masked UTF-8 line to the output, which is finally written
// to "test_data_ret.txt". Text after the last '\n' is not processed.
// NOTE(review): the numeric literals were missing from the source. max_line_len is
// assumed equal to enmMaxContentLength, because CheckSensitiveWord() writes up to
// that many bytes into the line buffer it is given.
void SensitiveWordsChecker::Test()
{
    const uint32_t max_line_len = enmMaxContentLength;

    // Heap storage: two file-sized arrays are too large to keep on the stack.
    std::vector<char> utf8_buf(enmMaxWordsFileLength, '\0');
    std::vector<char> out_utf8_buf(enmMaxWordsFileLength, '\0');
    if (LoadFile(&utf8_buf[0], enmMaxWordsFileLength, "test_data.txt") != 0)
    {
        return;
    }

    const char *line_begin = &utf8_buf[0];
    const char *line_end = NULL;
    uint32_t offset = 0;
    while ((line_end = strchr(line_begin, '\n')) != NULL)
    {
        char in_uft8_line[max_line_len] = { 0 };
        char out_uft8_line[max_line_len] = { 0 };
        char out_gbk_line[max_line_len] = { 0 };
        char gbk[enmMaxContentLength] = { 0 };

        // Clamp over-long lines so the copy cannot overflow the line buffer.
        size_t line_len = static_cast<size_t>(line_end - line_begin);
        if (line_len >= max_line_len)
        {
            line_len = max_line_len - 1;
        }
        memcpy(in_uft8_line, line_begin, line_len);
        line_begin = line_end + 1;

        // One byte is held back in each zero-filled buffer so the printed text is
        // terminated even if a conversion fills its output completely.
        UTF8_To_GBK(in_uft8_line, strlen(in_uft8_line), out_gbk_line, max_line_len - 1);
        printf("%s\n", out_gbk_line);

        CheckSensitiveWord(out_uft8_line, in_uft8_line);

        UTF8_To_GBK(out_uft8_line, strlen(out_uft8_line), gbk, enmMaxContentLength - 1);
        printf("%s\n", gbk);
        StrAppend(&out_utf8_buf[0], enmMaxWordsFileLength, offset, "%s", out_uft8_line);
    }
    WriteToFile(&out_utf8_buf[0], static_cast<int32_t>(offset), "test_data_ret.txt");
}

// Appends printf-style formatted text to `buf` at position `offset`.
//   buf     destination buffer
//   bufLen  capacity of buf in bytes
//   offset  write position; advanced by the number of characters written
//   fmt     printf-style format string followed by its arguments
// Nothing is written when the offset is already at or past the end of the buffer.
// The offset is left unchanged when formatting fails.
void SensitiveWordsChecker::StrAppend(char buf[], const uint32_t bufLen, uint32_t &offset, const char *fmt, ...)
{
    if (buf == NULL || fmt == NULL || offset >= bufLen)
    {
        return;
    }

    va_list argptr;
    va_start(argptr, fmt);
    const int written = vsprintf_s(buf + offset, bufLen - offset, fmt, argptr);
    va_end(argptr);

    // vsprintf_s returns a negative value on error; adding that to the unsigned
    // offset would wrap it around.
    if (written > 0)
    {
        offset += static_cast<uint32_t>(written);
    }
}
测试效果:
完整VS2013工程:http://download.csdn.net/detail/tangxin19930330/9558997
C++ 简单中文敏感词检测工具类的更多相关文章
- 关于spring中Assert的应用(方法入参检测工具类)
关于spring中Assert的应用(方法入参检测工具类) Web 应用在接受表单提交的数据后都需要对其进行合法性检查,如果表单数据不合法,请求将被驳回.类似的,当我们在编写类的方法时,也常常需要对方 ...
- 简单了解Spring中常用工具类_java - JAVA
文章来源:嗨学网 敏而好学论坛www.piaodoo.com 欢迎大家相互学习 文件资源操作 Spring 定义了一个 org.springframework.core.io.Resource 接口, ...
- 敏感词检测、屏蔽设计(iOS & Android)
敏感词检测 服务器端最常使用的算法是DFA算法.如果服务器端使用java实现常规的DFA算法,假若... 源码:https://github.com/qiyer/DFA_Cplusplus
- Android敏感词过滤主要类
package com.tradeaider.app.utils; import com.tradeaider.app.activity.MyApplication;import java.util. ...
- 【YFMemoryLeakDetector】人人都能理解的 iOS 内存泄露检测工具类
背景 即使到今天,iOS 应用的内存泄露检测,仍然是一个很重要的主题.我在一年前,项目中随手写过一个简单的工具类,当时的确解决了大问题.视图和控制器相关的内存泄露,几乎都不存在了.后来想着一直就那个工 ...
- php简单实用的操作文件工具类(创建、移动、复制、删除)
php简单实用好用的文件及文件夹复制函数和工具类(创建.移动.复制.删除) function recurse_copy($src,$dst) { // 原目录,复制到的目录 $dir = opend ...
- python中文及符号检测工具带GUI界面
import tkinter import webbrowser import re #本程序是一个中文字符和中文检测工具 #中文字符自己添加,我只添加了一点 #输入字符串,点击检查文本即可判断有没有 ...
- Spring Assert(方法入参检测工具类-断言)
Web 应用在接受表单提交的数据后都需要对其进行合法性检查,如果表单数据不合法,请求将被驳回.类似的,当我们在编写类的方法时,也常常需要对方法入参进行合 法性检查,如果入参不符合要求,方法将通过抛出异 ...
- Java 实现简单的SQL动态组装工具类
第一版 package com.zh.oukele.util; import java.util.HashMap; import java.util.Iterator; import java.uti ...
随机推荐
- ArduinoYun的电源插座
ArduinoYun的电源插座 Arduino Yun有两排插座,这些插座可以按类型分为三类:电源.数字IO和模拟输入.电源部分主要集中在如图1.7所示的部分本文选自Arduino Yun快速入门教程 ...
- JavaScript中两个感叹号(!!)的作用是什么?
!!一般用来将后面的表达式强制转换为布尔类型的数据(boolean),也就是只能是true或者false. 看这么个例子: var a: var b=!!a; a默认是undefined.!a是tru ...
- windows 8 系统部署IIS并发布网站
企业用户可以在已经部署了windows 8 的电脑中通过部署IIS服务器来发布自己公司的企业内部网站实现对企业的网络办公的管理工作. 准备篇 IIS的添加和运行 一.IIS的添加 1.请进入“控制面板 ...
- ZOJ 3908 Number Game ZOJ Monthly, October 2015 - F
Number Game Time Limit: 2 Seconds Memory Limit: 65536 KB The bored Bob is playing a number game ...
- 初始化lpc2106开发工程
单片机型号:lpc2106.Init.s:初始化pc指针和sp指针. AREA Init, CODE, READONLY IMPORT test1_main EXPORT ...
- TYVJ P1083 分糖果 Label:bfs
描述 童年的我们,将和朋友分享美好的事物作为自己的快乐.这天,C小朋友得到了Plenty of candies,将要把这些糖果分给要好的朋友们.已知糖果从一个人传给另一个人需要1 秒的时间,同一个小朋 ...
- 【wikioi】1010 过河卒
题目链接 算法:DFS+剪枝 14.01.02 PS: 递推应该也可以的,改天看看 刚开始最容易想到的是朴素搜索 #include <iostream> using namespace s ...
- 使用mybatis执行oracle存储过程
存储过程在小公司用的不多,但是如果业务比较复杂或者性能要求比较苛刻的时候存储过程就派上用场了,ibatis的前期的一些版本貌似不支持存储过程因此我选择了mybatis来做实验. 1.无输入和输出参数的 ...
- Qt5.4 VS2010 Additional Dependancies
Go to Linker -> General -> Additional LIbrary Directories: qtmaind.libQt5Cored.libQt5Guid.libQ ...
- 新建childTest文件夹,里面依然放进我们需要的.py文件即可
一.模块 我们编写文件:a.py,放在C:\Python34\Lib\sit-packages下,里面写上一句代码为: print('this is a') 之后我们就可以在我们的代码里面引用a.py ...