C#脏字过滤算法

public class DirtyWordOper
    {
        // Set of known dirty words. Only the keys are used; every value is null.
        private static Dictionary<string, object> hash = new Dictionary<string, object>();
        // Flags indexed by UTF-16 code unit marking characters that start at least one dirty word.
        // Sized char.MaxValue + 1 so that '\uFFFF' is a valid index.
        private static BitArray firstCharCheck = new BitArray(char.MaxValue + 1);
        // Flags indexed by UTF-16 code unit marking characters that occur anywhere in a dirty word.
        private static BitArray allCharCheck = new BitArray(char.MaxValue + 1);
        // Length of the longest dirty word; bounds the candidate substring length while scanning.
        private static int maxLength = 0;
        // True until the word tables have been loaded once.
        private static volatile bool onlyOne = true;
        // Guards the one-time initialization of the static tables above.
        private static readonly object initLock = new object();

        #region Filtering
        /// <summary>
        /// Replaces every dirty word found in <paramref name="text"/> with the replacement
        /// stored for it in <c>DirtyWordData.DirtyHT</c>.
        /// </summary>
        /// <param name="text">Text to filter. Null or empty input is returned unchanged.</param>
        /// <returns>The filtered text, or the original text when it contains no dirty word.</returns>
        public string Replace(string text)
        {
            if (string.IsNullOrEmpty(text))
            {
                return text;
            }
            if (onlyOne)
            {
                lock (initLock)
                {
                    // Re-check inside the lock so the tables are populated exactly once.
                    if (onlyOne)
                    {
                        Init();
                        onlyOne = false;
                    }
                }
            }
            if (!isDirtyword(text))
            {
                return text;
            }
            // Collect the replacement operations, then apply them.
            List<DetailRepModel> drlist = GetList(text);
            return Replace2(text, drlist);
        }

        /// <summary>
        /// Loads the dirty words from <c>DirtyWordData.DirtyKeyword</c> ('|'-separated entries)
        /// into the lookup tables. Called once, from <see cref="Replace"/>.
        /// </summary>
        private static void Init()
        {
            string[] badwords = DirtyWordData.DirtyKeyword.Split('|');
            foreach (string bw in badwords)
            {
                // Only the text before the first '&' of an entry is used as the word.
                string word = bw.Split('&')[0];
                // Strip line breaks and surrounding whitespace left over from the data file.
                word = word.Replace("\r", "").Replace("\n", "").Trim();
                if (word == "")
                {
                    // Skip blank entries (e.g. "a||b" or a trailing '|') without dropping the rest.
                    continue;
                }
                if (!hash.ContainsKey(word))
                {
                    hash.Add(word, null);
                    maxLength = Math.Max(maxLength, word.Length);
                    firstCharCheck[word[0]] = true;
                    foreach (char c in word)
                    {
                        allCharCheck[c] = true;
                    }
                }
            }
        }

        /// <summary>
        /// Tells whether <paramref name="text"/> contains at least one dirty word.
        /// </summary>
        /// <param name="text">Non-null text to scan.</param>
        /// <returns>True when any substring of the text is a known dirty word.</returns>
        private static bool isDirtyword(string text)
        {
            for (int index = 0; index < text.Length; index++)
            {
                // A match can only start on a character that begins some dirty word.
                if (!firstCharCheck[text[index]])
                {
                    continue;
                }
                int limit = Math.Min(maxLength, text.Length - index);
                for (int j = 1; j <= limit; j++)
                {
                    // A character that appears in no dirty word ends every candidate starting here.
                    if (!allCharCheck[text[index + j - 1]])
                    {
                        break;
                    }
                    if (hash.ContainsKey(text.Substring(index, j)))
                    {
                        return true;
                    }
                }
            }
            return false;
        }

        /// <summary>
        /// Builds the list of replacement operations for <paramref name="text"/>.
        /// At each position the longest matching dirty word wins, and scanning resumes
        /// after it, so the returned operations are in ascending order and do not overlap.
        /// </summary>
        /// <param name="text">Non-null text to scan.</param>
        /// <returns>One entry per match, holding its start index, length and matched text.</returns>
        private static List<DetailRepModel> GetList(string text)
        {
            List<DetailRepModel> detailList = new List<DetailRepModel>();
            int index = 0;
            while (index < text.Length)
            {
                if (!firstCharCheck[text[index]])
                {
                    index++;
                    continue;
                }
                int limit = Math.Min(maxLength, text.Length - index);
                int matched = 0; // length of the longest dirty word starting at index, 0 if none
                for (int j = 1; j <= limit; j++)
                {
                    if (!allCharCheck[text[index + j - 1]])
                    {
                        break;
                    }
                    // Keep going after a hit: with "a" and "ab" both listed, "ab" must win.
                    if (hash.ContainsKey(text.Substring(index, j)))
                    {
                        matched = j;
                    }
                }
                if (matched > 0)
                {
                    DetailRepModel detail = new DetailRepModel();
                    detail.index = index;
                    detail.number = matched;
                    detail.content = text.Substring(index, matched);
                    detailList.Add(detail);
                    index += matched;
                }
                else
                {
                    index++;
                }
            }
            return detailList;
        }

        /// <summary>
        /// Applies the replacement operations in <paramref name="drlist"/> to <paramref name="text"/>.
        /// </summary>
        /// <param name="text">Original text; the operation offsets refer to this string.</param>
        /// <param name="drlist">Operations in ascending, non-overlapping order (as built by GetList).</param>
        /// <returns>The text with each matched word replaced by its <c>DirtyWordData.DirtyHT</c> entry.</returns>
        private static string Replace2(string text, List<DetailRepModel> drlist)
        {
            if (drlist == null || drlist.Count == 0 || string.IsNullOrEmpty(text))
            {
                return text;
            }
            // Build the result from the untouched original so the recorded offsets stay valid
            // even when a replacement is longer or shorter than the word it replaces.
            System.Text.StringBuilder result = new System.Text.StringBuilder(text.Length);
            int cursor = 0; // first position of text not yet copied into result
            foreach (DetailRepModel dr in drlist)
            {
                // Ignore entries that are missing, overlap an earlier one, or fall outside the text.
                if (dr == null || dr.index < cursor || dr.index + dr.number > text.Length)
                {
                    continue;
                }
                string matched = text.Substring(dr.index, dr.number);
                object ob = DirtyWordData.DirtyHT[(object)matched];
                if (ob == null)
                {
                    // No replacement registered for this word: leave it as is and keep
                    // processing the remaining operations.
                    continue;
                }
                result.Append(text, cursor, dr.index - cursor);
                result.Append(ob.ToString());
                cursor = dr.index + dr.number;
            }
            result.Append(text, cursor, text.Length - cursor);
            return result.ToString();
        }
        #endregion
    }

上面这种实现效果还行。不过我们老大给我讲了一个更厉害的方法，据说比这种方式快 50 倍，只是写起来有点麻烦：

/// <summary>
/// Contract for a dirty-word replacer: takes a string and returns a string.
/// The source text that ReplaceDW compiles at run time declares a class
/// implementing this interface.
/// </summary>
public interface IReplaceDW
    {
        /// <summary>
        /// Produces the replaced version of <paramref name="s"/>.
        /// </summary>
        /// <param name="s">Input text.</param>
        /// <returns>The resulting text.</returns>
        string Replace(string s);
    }
    /// <summary>
    /// Dirty-word filter that turns the word list into a character trie, generates C# source
    /// containing nested <c>switch</c> statements for that trie, compiles it in memory and
    /// delegates <see cref="Replace"/> to the compiled class.
    /// </summary>
    public class ReplaceDW
    {
        // Guards the one-time compilation shared by all instances.
        private static readonly object s_gate = new object();
        // Compiled replacer, built on first successful construction and then reused.
        // The generated class keeps no state, so sharing one instance is safe.
        private static IReplaceDW s_compiled = null;

        private IReplaceDW _r = null;

        /// <summary>
        /// Inserts the word <paramref name="s"/> into the trie rooted at <paramref name="parent"/>
        /// and stores <paramref name="t"/> as the replacement on the node of its last character.
        /// </summary>
        /// <param name="parent">Trie node to insert under; its Children list must be initialized.</param>
        /// <param name="s">Word to add. Null or empty words are ignored.</param>
        /// <param name="t">Replacement text for the word.</param>
        public static void AddToWords(DirtyChar parent, string s, string t)
        {
            if (parent == null || string.IsNullOrEmpty(s))
            {
                return;
            }
            DirtyChar node = parent;
            foreach (char ch in s)
            {
                char key = ch; // local copy so the lambda below captures a stable value
                DirtyChar next = node.Children.Find(o => o.Orienginal == key);
                if (next == null)
                {
                    next = new DirtyChar() { Orienginal = key, Children = new List<DirtyChar>(), Target = "" };
                    node.Children.Add(next);
                }
                node = next;
            }
            node.Target = t;
        }

        /// <summary>
        /// Generates the C# statements that continue matching below <paramref name="dc"/>,
        /// assuming the generated code has already matched <paramref name="deepLevel"/>
        /// characters starting at position <c>i</c> of the input <c>s</c>.
        /// </summary>
        /// <param name="dc">Trie node reached after matching <paramref name="deepLevel"/> characters.</param>
        /// <param name="deepLevel">Number of characters matched so far (1 for a first-level node).</param>
        /// <returns>Source text to place inside the <c>case</c> section for this node.</returns>
        public static string BuildChildren(DirtyChar dc, int deepLevel)
        {
            StringBuilder code = new StringBuilder();
            AppendNode(code, dc, deepLevel, null, 0);
            return code.ToString();
        }

        // Emits the matching code for one trie node. fallbackTarget/fallbackLength describe the
        // longest complete word seen on the path so far (null when there is none); that word is
        // what gets replaced if no longer word matches.
        private static void AppendNode(StringBuilder code, DirtyChar dc, int depth, string fallbackTarget, int fallbackLength)
        {
            string pad = new string(' ', depth + 4);
            bool isLeaf = dc.Children == null || dc.Children.Count == 0;

            if (isLeaf)
            {
                // AddToWords only creates nodes along a word, so a leaf always ends a word.
                code.AppendLine(pad + BuildAction(dc.Target ?? "", depth));
                return;
            }

            // An interior node ends a word only if a replacement was stored on it.
            // NOTE(review): a word that is a prefix of another word and maps to an empty
            // replacement cannot be told apart from a plain prefix here — confirm whether
            // empty replacements are used.
            if (!string.IsNullOrEmpty(dc.Target))
            {
                fallbackTarget = dc.Target;
                fallbackLength = depth;
            }
            string fallback = BuildAction(fallbackTarget, fallbackLength);

            // Bounds check before peeking at s[i + depth]; at end of input use the fallback.
            code.AppendLine(pad + "if (i + " + depth.ToString() + " >= len) { " + fallback + " }");
            code.AppendLine(pad + "else switch (s[i + " + depth.ToString() + "])");
            code.AppendLine(pad + "{");
            foreach (DirtyChar c in dc.Children)
            {
                code.AppendLine(pad + "  case " + ToCharLiteral(c.Orienginal) + ":");
                AppendNode(code, c, depth + 1, fallbackTarget, fallbackLength);
                code.AppendLine(pad + "    break;");
            }
            code.AppendLine(pad + "  default:");
            code.AppendLine(pad + "    " + fallback);
            code.AppendLine(pad + "    break;");
            code.AppendLine(pad + "}");
        }

        // Generated statements for "emit output and advance": replace `length` matched characters
        // with `target`, or, when target is null (no word matched), copy one character through.
        private static string BuildAction(string target, int length)
        {
            if (target == null)
            {
                return "sb.Append(s[i]); i++;";
            }
            return "sb.Append(" + ToStringLiteral(target) + "); i += " + length.ToString() + ";";
        }

        // Renders a character as a \uXXXX char literal so quotes, backslashes and control
        // characters in the word list cannot break or alter the generated source.
        private static string ToCharLiteral(char c)
        {
            return "'\\u" + ((int)c).ToString("X4") + "'";
        }

        // Renders text as a string literal made only of \uXXXX escapes (same reason as above).
        private static string ToStringLiteral(string text)
        {
            StringBuilder literal = new StringBuilder(text.Length * 6 + 2);
            literal.Append('"');
            foreach (char c in text)
            {
                literal.Append("\\u").Append(((int)c).ToString("X4"));
            }
            literal.Append('"');
            return literal.ToString();
        }

        /// <summary>
        /// Replaces the dirty words in <paramref name="s"/> using the compiled replacer.
        /// </summary>
        /// <param name="s">Text to filter. Null or empty input is returned unchanged.</param>
        /// <returns>The filtered text.</returns>
        /// <exception cref="InvalidOperationException">
        /// The replacer could not be compiled when this instance was constructed.
        /// </exception>
        public string Replace(string s)
        {
            if (string.IsNullOrEmpty(s))
            {
                return s;
            }
            if (_r == null)
            {
                throw new InvalidOperationException("The dirty-word replacer is not available because its generated code failed to compile.");
            }
            return _r.Replace(s);
        }

        /// <summary>
        /// Creates a filter. The first successful construction builds and compiles the replacer
        /// from <c>KeyWord.DirtyWordData.DirtyHT</c>; later instances reuse it. Failures are
        /// written to the console and leave the instance unusable (see <see cref="Replace"/>).
        /// </summary>
        public ReplaceDW()
        {
            lock (s_gate)
            {
                if (s_compiled == null)
                {
                    try
                    {
                        s_compiled = Compile();
                    }
                    catch (Exception ex)
                    {
                        Console.WriteLine(ex);
                    }
                }
                _r = s_compiled;
            }
        }

        // Builds the trie from the word table, generates the replacer source, compiles it in
        // memory and returns an instance, or null when compilation fails.
        private static IReplaceDW Compile()
        {
            // Copy the word -> replacement pairs out of the table.
            List<KeyValuePair<string, string>> pairs = new List<KeyValuePair<string, string>>();
            foreach (DictionaryEntry d in KeyWord.DirtyWordData.DirtyHT)
            {
                string replacement = d.Value == null ? "" : d.Value.ToString();
                pairs.Add(new KeyValuePair<string, string>(d.Key.ToString(), replacement));
            }
            // Descending order, as before; ordinal comparison keeps the generated code deterministic.
            pairs.Sort((a, b) => string.CompareOrdinal(b.Key, a.Key));

            DirtyChar words = new DirtyChar() { Children = new List<DirtyChar>() };
            foreach (KeyValuePair<string, string> kv in pairs)
            {
                AddToWords(words, kv.Key, kv.Value);
            }

            // Source of the class to compile: one outer switch on s[i], nested switches below it.
            StringBuilder source = new StringBuilder();
            source.AppendLine("using System;");
            source.AppendLine("namespace KeyWord");
            source.AppendLine("{");
            source.AppendLine("public class ReplaceDW_ : IReplaceDW");
            source.AppendLine("{");
            source.AppendLine("  public string Replace(string s)");
            source.AppendLine("  {");
            source.AppendLine("    int len = s.Length, i = 0;");
            source.AppendLine("    System.Text.StringBuilder sb = new System.Text.StringBuilder(len);");
            source.AppendLine("    while (i < len)");
            source.AppendLine("    {");
            source.AppendLine("      switch (s[i])");
            source.AppendLine("      {");
            foreach (DirtyChar c in words.Children)
            {
                source.AppendLine("      case " + ToCharLiteral(c.Orienginal) + ":");
                source.Append(BuildChildren(c, 1));
                source.AppendLine("        break;");
            }
            source.AppendLine("      default:");
            source.AppendLine("        sb.Append(s[i++]);");
            source.AppendLine("        break;");
            source.AppendLine("      }");
            source.AppendLine("    }");
            source.AppendLine("    return sb.ToString();");
            source.AppendLine("  }");
            source.AppendLine("}");
            source.AppendLine("}");

            CSharpCodeProvider compiler = new CSharpCodeProvider();
            CompilerParameters options = new CompilerParameters();
            options.CompilerOptions = "/o";
            options.GenerateExecutable = false;
            options.GenerateInMemory = true;
            options.ReferencedAssemblies.Add("System.dll");
            // The generated class implements IReplaceDW, so reference the assembly declaring it.
            options.ReferencedAssemblies.Add(typeof(IReplaceDW).Assembly.Location);

            CompilerResults result = compiler.CompileAssemblyFromSource(options, source.ToString());
            foreach (CompilerError error in result.Errors)
            {
                Console.WriteLine(error);
            }
            if (result.Errors.HasErrors || result.CompiledAssembly == null)
            {
                return null;
            }

            Type type = result.CompiledAssembly.GetType("KeyWord.ReplaceDW_");
            if (type == null)
            {
                return null;
            }
            IReplaceDW instance = Activator.CreateInstance(type) as IReplaceDW;
            if (instance != null)
            {
                // Warm-up calls so the first real request does not pay the JIT cost.
                instance.Replace("x");
                instance.Replace("x");
            }
            return instance;
        }
    }

点击下载本例源码

C#脏字过滤算法的更多相关文章

SVD++：推荐系统的基于矩阵分解的协同过滤算法的提高
1.背景知识在讲SVD++之前,我还是想先回到基于物品相似的协同过滤算法.这个算法基本思想是找出一个用户有过正反馈的物品的相似的物品来给其作为推荐.其公式为:
GBDT(Gradient Boosting Decision Tree)算法&协同过滤算法
GBDT(Gradient Boosting Decision Tree)算法参考:http://blog.csdn.net/dark_scope/article/details/24863289 理 ...
Spark机器学习之协同过滤算法
Spark机器学习之协同过滤算法一).协同过滤 1.1 概念协同过滤是一种借助"集体计算"的途径.它利用大量已有的用户偏好来估计用户对其未接触过的物品的喜好程度.其内在思想是相 ...
Collaborative Filtering(协同过滤)算法详解
基本思想基于用户的协同过滤算法是通过用户的历史行为数据发现用户对商品或内容的喜欢(如商品购买,收藏,内容评论或分享),并对这些喜好进行度量和打分.根据不同用户对相同商品或内容的态度和偏好程度计算用户 ...
【机器学习笔记一】协同过滤算法 - ALS
参考资料 [1]<Spark MLlib 机器学习实践> [2]http://blog.csdn.net/u011239443/article/details/51752904 [3]线性 ...
吴恩达机器学习笔记58-协同过滤算法（Collaborative Filtering Algorithm）
在之前的基于内容的推荐系统中,对于每一部电影,我们都掌握了可用的特征,使用这些特征训练出了每一个用户的参数.相反地,如果我们拥有用户的参数,我们可以学习得出电影的特征. 但是如果我们既没有用户的参数, ...
Spark机器学习(11)：协同过滤算法
协同过滤(Collaborative Filtering,CF)算法是一种常用的推荐算法,它的思想就是找出相似的用户或产品,向用户推荐相似的物品,或者把物品推荐给相似的用户.怎样评价用户对商品的偏好? ...
亚马逊协同过滤算法 Collaborative filtering
这节课时郭强的三维课.他讲的是MAYA和max .自己对这个也不怎么的感兴趣.而且这个课感觉属于数字媒体.自己对游戏,动画,这些东西一点都不兴趣,比如大一的时候刚开学的时候,张瑞的数字媒体的导论课.还 ...
win7下使用Taste实现协同过滤算法
如果要实现Taste算法,必备的条件是: 1) JDK,使用1.6版本.需要说明一下,因为要基于Eclipse构建,所以在设置path的值之前要先定义JAVA_HOME变量. 2) Maven,使用2 ...

随机推荐

Tomcat服务器如何读取本地磁盘数据?
实际问题: 如何让用户下载本地磁盘的资源文件呢? 在server.xml文件中配置虚拟路径如下(以下代码放在Host标签之中即可): 例如: 具体含义: 把本地磁盘目录 "D:\uploa ...
intellij idea 插件开发--快速定位到mybatis mapper文件中的sql
intellij idea 提供了openApi,通过openApi我们可以自己开发插件,提高工作效率.这边直接贴个链接,可以搭个入门的demo:http://www.jianshu.com/p/24 ...
UVa10791 - Minimum Sum LCM
分析即为紫薯上的分析. 难点是发现当每个aipi作为一个单独的整数时才最优.. 答案就是将所有不同的相同因子的积相加即可代码: #include<cstdio> #include&l ...
Java数据结构和算法总结-字符串及高频面试题算法
前言:周末闲来无事,在七月在线上看了看字符串相关算法的讲解视频,收货颇丰,跟着视频讲解简单做了一下笔记,方便以后翻阅复习同时也很乐意分享给大家.什么字符串在算法中有多重要之类的大路边上的客套话就不多说 ...
MySQL比like语句更高效的写法locate position instr find_in_set
使用内部函数instr,可代替传统的like方式查询,并且速度更快. instr函数,第一个参数是字段,第二个参数是要查询的串,返回串的位置,第一个是1,如果没找到就是0. 例如, select na ...
IO 调优
磁盘优化 1.增加缓存 2.优化磁盘的管理系统 3.设计合理的磁盘存储数据块 4.应用合理的RAID策略 TCP网络参数调优网络IO优化 1.减少网络交互次数 2.减少网络传输数据量的大小 3.尽量 ...
LeetCode 599. Minimum Index Sum of Two Lists （从两个lists里找到相同的并且位置总和最靠前的）
Suppose Andy and Doris want to choose a restaurant for dinner, and they both have a list of favorite ...
LeetCode 339. Nested List Weight Sum （嵌套列表重和）$
Given a nested list of integers, return the sum of all integers in the list weighted by their depth. ...
php 不写闭合标签
参阅了一些文章,对PHP闭合标签的总结如下: 好处:如果这个是一个被别人包含的程序,没有这个结束符,可以减少很多很多问题,比如说:header, setcookie, session_st ...
swift之函数式编程（四）
文章内容来自<Functional Programing in Swift>,具体内容请到书中查阅 Map, Filter, Reduce Functions that take func ...

C#脏字过滤算法

C#脏字过滤算法的更多相关文章

随机推荐

热门专题