点击下载 SegList.rar

主要功能如下
最新的SegList分词辅助类,帮助类
看下面代码吧

/// <summary>
/// 类说明:SegList
/// 编 码 人:苏飞
/// 联系方式:361983679
/// 更新网站:http://www.sufeinet.com/thread-655-1-1.html
/// </summary>
using System;
using System.Collections;
using System.IO;
using System.Text.RegularExpressions; namespace DotNet.Utilities
{
/// <summary>
/// Word-segmentation helper: an ordered list of entries that also tracks the
/// length of the longest entry added so far.
/// </summary>
public class SegList
{
    /// <summary>
    /// Length of the string form of the longest entry passed to <see cref="Add"/>.
    /// </summary>
    public int MaxLength;

    private ArrayList m_seg;

    /// <summary>
    /// Number of entries currently stored.
    /// </summary>
    public int Count
    {
        get { return m_seg.Count; }
    }

    /// <summary>
    /// Creates an empty list.
    /// </summary>
    public SegList()
    {
        m_seg = new ArrayList();
        MaxLength = 0;
    }

    /// <summary>
    /// Appends an entry and updates <see cref="MaxLength"/> if its string form is longer.
    /// </summary>
    /// <param name="obj">Entry to append; its <c>ToString()</c> length is measured.</param>
    public void Add(object obj)
    {
        m_seg.Add(obj);
        int length = obj.ToString().Length;
        if (MaxLength < length)
        {
            MaxLength = length;
        }
    }

    /// <summary>
    /// Returns the entry at index <paramref name="i"/>, or <c>null</c> when the index is out of range.
    /// </summary>
    public object GetElem(int i)
    {
        // Negative indexes are treated like any other out-of-range index.
        if (i >= 0 && i < Count)
        {
            return m_seg[i];
        }
        return null;
    }

    /// <summary>
    /// Replaces the entry at index <paramref name="i"/>.
    /// </summary>
    public void SetElem(int i, object obj)
    {
        m_seg[i] = obj;
    }

    /// <summary>
    /// Returns whether the list contains <paramref name="obj"/>.
    /// </summary>
    public bool Contains(object obj)
    {
        return m_seg.Contains(obj);
    }

    /// <summary>
    /// Sorts this list by entry length, longest first.
    /// </summary>
    public void Sort()
    {
        Sort(this);
    }

    /// <summary>
    /// Sorts <paramref name="list"/> in place by entry length, longest first.
    /// The placeholder entry "null" is ranked as length 0, so it ends up last.
    /// </summary>
    public void Sort(SegList list)
    {
        // Selection sort: on each pass move the longest remaining entry to position i.
        for (int i = 0; i < list.Count - 1; ++i)
        {
            int max = i;
            for (int j = i + 1; j < list.Count; ++j)
            {
                if (SortLength(list.GetElem(j)) > SortLength(list.GetElem(max)))
                {
                    max = j;
                }
            }
            object longest = list.GetElem(max);
            list.SetElem(max, list.GetElem(i));
            list.SetElem(i, longest);
        }
    }

    // Length used for ordering; the "null" placeholder counts as empty.
    private static int SortLength(object elem)
    {
        string text = elem.ToString();
        return text == "null" ? 0 : text.Length;
    }
}

/// <summary>
/// 分词类
/// </summary>
//----------------调用----------------------
//Segment seg = new Segment();
//seg.InitWordDics();
//seg.EnablePrefix = true;
//seg.Separator =" ";
//seg.SegmentText("字符串", false).Trim();
//-------------------------------------------
public class Segment
{
#region 私有字段
// Default dictionary file locations, mapped through the current web request's server.
private string m_DicPath = System.Web.HttpContext.Current.Server.MapPath("bin/ShootSeg/sDict.dic");
private string m_NoisePath = System.Web.HttpContext.Current.Server.MapPath("bin/ShootSeg/sNoise.dic");
private string m_NumberPath = System.Web.HttpContext.Current.Server.MapPath("bin/ShootSeg/sNumber.dic");
private string m_WordPath = System.Web.HttpContext.Current.Server.MapPath("bin/ShootSeg/sWord.dic");
private string m_PrefixPath = System.Web.HttpContext.Current.Server.MapPath("bin/ShootSeg/sPrefix.dic");

// Two-level lookup: first character -> (second character -> SegList of remaining suffixes).
private Hashtable htWords;
private ArrayList alNoise;
private ArrayList alNumber;
private ArrayList alWord;
private ArrayList alPrefix;

// Duration of the most recent load/segment/sort operation, in milliseconds.
private double m_EventTime = 0;

/// <summary>
/// Separator inserted between segmented words.
/// </summary>
private string m_Separator = " ";

/// <summary>
/// Regular expression used to recognise a Chinese character.
/// </summary>
private string strChinese = "[\u4e00-\u9fa5]";
#endregion

#region 公有属性
/// <summary>
/// Path of the main dictionary file.
/// </summary>
public string DicPath
{
    get { return m_DicPath; }
    set { m_DicPath = value; }
}

/// <summary>
/// Stores a value in the web application's state under the given key.
/// </summary>
/// <param name="key">Key to store the value under.</param>
/// <param name="val">Value to cache; <c>null</c> is replaced by a single-space string.</param>
private static void SetCache(string key, object val)
{
    if (val == null)
    {
        val = " ";
    }

    System.Web.HttpApplicationState application = System.Web.HttpContext.Current.Application;
    application.Lock();
    try
    {
        application.Set(key, val);
    }
    finally
    {
        // Release the application lock even if Set throws; otherwise it would stay held.
        application.UnLock();
    }
}

/// <summary>
/// Reads a value from the web application's state.
/// </summary>
/// <param name="key">Key to look up.</param>
/// <returns>Whatever the application state returns for <paramref name="key"/>.</returns>
private static object GetCache(string key)
{
    object cached = System.Web.HttpContext.Current.Application.Get(key);
    return cached;
}

/// <summary>
/// Path of the noise-word list. The original note marks it as currently unused:
/// the list is loaded by InitWordDics but not consulted elsewhere in this class.
/// </summary>
public string NoisePath
{
    get { return m_NoisePath; }
    set { m_NoisePath = value; }
}

/// <summary>
/// Path of the number dictionary file.
/// </summary>
public string NumberPath
{
    get { return m_NumberPath; }
    set { m_NumberPath = value; }
}

/// <summary>
/// Path of the letter dictionary file.
/// </summary>
public string WordPath
{
    get { return m_WordPath; }
    set { m_WordPath = value; }
}

/// <summary>
/// Path of the name-prefix dictionary, used for correcting personal names.
/// </summary>
public string PrefixPath
{
    get { return m_PrefixPath; }
    set { m_PrefixPath = value; }
}

/// <summary>
/// Gets or sets whether name correction is enabled.
/// Reading returns <c>true</c> when the prefix list holds at least one entry.
/// Setting <c>true</c> reloads the list from <see cref="PrefixPath"/>;
/// setting <c>false</c> replaces it with an empty list.
/// </summary>
public bool EnablePrefix
{
    get
    {
        // The prefix list is null until InitWordDics or this setter has run.
        return alPrefix != null && alPrefix.Count != 0;
    }
    set
    {
        if (value)
        {
            alPrefix = LoadWords(PrefixPath, alPrefix);
        }
        else
        {
            alPrefix = new ArrayList();
        }
    }
}

/// <summary>
/// Time taken by the most recent load or segmentation operation, in milliseconds.
/// May be 0 when segmenting short strings.
/// </summary>
public double EventTime
{
    get { return m_EventTime; }
}

/// <summary>
/// Separator placed between words; defaults to a single space.
/// Assigning <c>null</c> or an empty string is ignored.
/// </summary>
public string Separator
{
    get { return m_Separator; }
    set
    {
        if (string.IsNullOrEmpty(value))
        {
            return;
        }
        m_Separator = value;
    }
}
#endregion

#region 构造方法
/// <summary>
/// Creates a segmenter that uses the default dictionary paths.
/// No dictionaries are loaded here; call InitWordDics before segmenting.
/// </summary>
public Segment()
{
}

/// <summary>
/// Creates a segmenter with explicit dictionary paths and loads the dictionaries immediately.
/// The name-prefix path keeps its default value.
/// </summary>
/// <param name="p_DicPath">Path of the main dictionary file.</param>
/// <param name="p_NoisePath">Path of the noise-word list.</param>
/// <param name="p_NumberPath">Path of the number dictionary file.</param>
/// <param name="p_WordPath">Path of the letter dictionary file.</param>
public Segment(string p_DicPath, string p_NoisePath, string p_NumberPath, string p_WordPath)
{
    // Each argument goes to its own field (previously all four overwrote m_WordPath).
    m_DicPath = p_DicPath;
    m_NoisePath = p_NoisePath;
    m_NumberPath = p_NumberPath;
    m_WordPath = p_WordPath;
    this.InitWordDics();
}
#endregion

#region 公有方法
/// <summary>
/// Loads the word lists.
/// The main dictionary is parsed into a two-level table (first character ->
/// second character -> list of remaining suffixes) and cached in application
/// state under "jcms_dict"; when that cache entry already exists the file is
/// not read again. The noise, number, letter and prefix lists are reloaded on
/// every call. The elapsed time is stored in EventTime.
/// </summary>
public void InitWordDics()
{
    DateTime start = DateTime.Now;
    if (GetCache("jcms_dict") == null)
    {
        htWords = new Hashtable();
        using (StreamReader reader = new StreamReader(DicPath, System.Text.Encoding.UTF8))
        {
            string strline;
            // Reading stops at end of file or at the first blank line.
            while ((strline = reader.ReadLine()) != null && strline.Trim() != "")
            {
                // An entry needs at least two characters to form both lookup keys.
                if (strline.Length < 2)
                {
                    continue;
                }

                string strChar1 = strline.Substring(0, 1);
                string strChar2 = strline.Substring(1, 1);

                Hashtable father = (Hashtable)htWords[strChar1];
                if (father == null)
                {
                    father = new Hashtable();
                    htWords.Add(strChar1, father);
                }

                SegList list = (SegList)father[strChar2];
                if (list == null)
                {
                    list = new SegList();
                    father.Add(strChar2, list);
                }

                // A two-character word has no suffix; it is recorded as the "null" placeholder.
                if (strline.Length > 2)
                {
                    list.Add(strline.Substring(2));
                }
                else
                {
                    list.Add("null");
                }
            }
        }
        SetCache("jcms_dict", htWords);
    }
    htWords = (Hashtable)GetCache("jcms_dict");

    alNoise = LoadWords(NoisePath, alNoise);
    alNumber = LoadWords(NumberPath, alNumber);
    alWord = LoadWords(WordPath, alWord);
    alPrefix = LoadWords(PrefixPath, alPrefix);

    TimeSpan duration = DateTime.Now - start;
    m_EventTime = duration.TotalMilliseconds;
}

/// <summary>
/// Reads a UTF-8 text file into a new ArrayList, one entry per line.
/// </summary>
/// <param name="strPath">Path of the file to read.</param>
/// <param name="list">Ignored: a fresh list is always created and returned. Kept so existing callers compile.</param>
/// <returns>The lines of the file, in file order.</returns>
public ArrayList LoadWords(string strPath, ArrayList list)
{
    ArrayList words = new ArrayList();
    // The using block closes the reader even if reading fails part-way.
    using (StreamReader reader = new StreamReader(strPath, System.Text.Encoding.UTF8))
    {
        string strline;
        while ((strline = reader.ReadLine()) != null)
        {
            words.Add(strline);
        }
    }
    return words;
}

/// <summary>
/// Writes every dictionary word to the console, one per line, as
/// first key + second key + stored suffix (the "null" placeholder is printed literally).
/// Does nothing when the dictionary has not been loaded.
/// </summary>
public void OutWords()
{
    if (htWords == null)
    {
        return;
    }

    foreach (DictionaryEntry first in htWords)
    {
        foreach (DictionaryEntry second in (Hashtable)first.Value)
        {
            SegList aa = (SegList)second.Value;
            for (int i = 0; i < aa.Count; i++)
            {
                Console.WriteLine(first.Key.ToString() + second.Key.ToString() + aa.GetElem(i).ToString());
            }
        }
    }
}

/// <summary>
/// Writes each element of an ArrayList to the console, one per line.
/// </summary>
/// <param name="list">List to print; <c>null</c> prints nothing.</param>
public void OutArrayList(ArrayList list)
{
    if (list == null)
    {
        return;
    }

    for (int i = 0; i < list.Count; i++)
    {
        Console.WriteLine(list[i].ToString());
    }
}

/// <summary>
/// Segmentation routine; does not handle line breaks.
/// </summary>
/// <param name="strText">The text to segment.</param>
/// <returns>The segmented text.</returns>
public string SegmentText(string strText)
{
// NOTE(review): this method's numeric literals appear to have been stripped when the
// source was copied (e.g. "int length = ;", "case :", "Substring(i, )"). The code does
// not compile as written. The values must be restored from the original source before
// any behavioural change is attempted - TODO confirm each one.
// NOTE(review): presumably, judging by GetCharType's legend (0 unknown, 1 digit,
// 2 letter, 3 Chinese, 4 Chinese numeral), the switch labels below were
// "case 1", "case 2 / case 5" and "case 3 / case 4" - verify against the original.
//
// "$" is appended before processing; " $" is removed from the result at the end.
// Note that the two early returns below hand back the text with "$" still appended.
strText = (strText + "$").Trim();
if (htWords == null) return strText;
if (strText.Length < ) return strText;
DateTime start = DateTime.Now;
// length: index reached by the main loop; preFix: count of pending name-prefix characters.
int length = ;
int preFix = ;
// word / number: whether the previously handled character was a letter / a digit.
bool word = false;
bool number = false;
// reText accumulates the output; strPrefix holds a pending name prefix.
string reText = "";
string strPrefix = "";
string strLastChar = "";
string strLastWords = Separator; for (int i = ; i < strText.Length - ; i++)
{
#region 对于每一个字的处理过程
// strChar1 is the current character, strChar2 the (trimmed) character after it.
string strChar1 = strText.Substring(i, );
string strChar2 = strText.Substring(i + , ).Trim();
// yes: set once the current character has been written to the output.
bool yes;
SegList l;
Hashtable h; if (reText.Length > ) strLastChar = reText.Substring(reText.Length - ); if (strChar1 == " ")
{
if ((number || word) && strLastChar != Separator) reText += this.Separator;
yes = true;
}
else
yes = false; int CharType = GetCharType(strChar1);
switch (CharType)
{
case :
#region 如果是数字,如果数字的上一位是字母要和后面的数字分开
if (word)
{
reText += Separator;
}
word = false;
number = true;
strLastWords = "";
break;
#endregion
case :
case :
#region 如果是字母
if (number)
strLastWords = Separator;
else
strLastWords = ""; word = true;
number = false;
break;
#endregion
case :
case :
#region 第一级哈希表是否包含关键字,假如包含处理第二级哈希表
// Was the previous character a letter?
// NOTE(review): the "#region" below shares a line with a statement, which the C#
// preprocessor rejects; it presumably sat on its own line originally.
if (word) reText += Separator; #region 检测上一个是否是数字,这个过程是用于修正数字后的量词的
if (number && CharType != )
{
// Entries under the "n" key are looked up for the character(s) following a number.
h = (Hashtable)htWords["n"];
if (h.ContainsKey(strChar1))
{
l = (SegList)h[strChar1];
if (l.Contains(strChar2))
{
reText += strChar1 + strChar2 + Separator;
yes = true;
i++;
}
else if (l.Contains("null"))
{
reText += strChar1 + Separator;
yes = true;
}
}
else
reText += Separator;
}
#endregion //非汉字数字的汉字
if (CharType == )
{
word = false;
number = false;
strLastWords = Separator;
}
else
{
word = false;
number = true;
strLastWords = "";
} // Fetch the second-level hash table.
h = (Hashtable)htWords[strChar1]; // Does the second-level table contain the key?
if (h.ContainsKey(strChar2))
{
#region 第二级包含关键字
// Fetch the suffix list.
l = (SegList)h[strChar2]; // Walk every entry to see whether it completes a word.
for (int j = ; j < l.Count; j++)
{
bool have = false;
string strChar3 = l.GetElem(j).ToString(); // Check each candidate suffix for a match, guarding against running past the text.
if ((strChar3.Length + i + ) < strText.Length)
{
// Take strChar3.Length characters from the text after the two key characters.
string strChar = strText.Substring(i + , strChar3.Length).Trim();
if (strChar3 == strChar && !yes)
{
if (strPrefix != "")
{
reText += strPrefix + Separator;
strPrefix = "";
preFix = ;
}
reText += strChar1 + strChar2 + strChar;
i += strChar3.Length + ;
have = true;
yes = true;
break;
}
}
else if ((strChar3.Length + i + ) == strText.Length)
{
string strChar = strText.Substring(i + ).Trim();
if (strChar3 == strChar && !yes)
{
if (strPrefix != "")
{
reText += strPrefix + Separator;
strPrefix = "";
preFix = ;
}
reText += strChar1 + strChar2 + strChar;
i += strChar3.Length + ;
have = true;
yes = true;
break;
}
} if (!have && j == l.Count - && l.Contains("null") && !yes)
{
// No longer word matched, but the list holds the "null" placeholder.
if (preFix == )
{
reText += strPrefix + strChar1 + strChar2;
strPrefix = "";
preFix = ;
}
else if (preFix > )
{
reText += strPrefix + strLastWords + strChar1 + strChar2;
strPrefix = "";
preFix = ;
}
else
{
// Both branches of this if/else append the same text.
if (CharType == ) reText += strChar1 + strChar2;
else reText += strChar1 + strChar2;
strLastWords = this.Separator;
number = false;
}
i++;
yes = true;
break;
}
else if (have)
{
break;
}
}
#endregion //如果没有匹配还可能有一种情况,这个词语只有两个字,以这两个字开头的词语不存在
if (!yes && l.Contains("null"))
{
if (preFix == )
{
reText += strPrefix + strChar1 + strChar2;
strPrefix = "";
preFix = ;
}
else if (preFix > )
{
reText += strPrefix + strLastWords + strChar1 + strChar2;
strPrefix = "";
preFix = ;
}
else
{
// Both branches of this if/else append the same text.
if (CharType == ) reText += strChar1 + strChar2;
else reText += strChar1 + strChar2;
strLastWords = this.Separator;
number = false;
}
i++;
yes = true;
}
if (reText.Length > ) strLastChar = reText.Substring(reText.Length - );
if (CharType == && GetCharType(strLastChar) == )
{
number = true;
}
else if (strLastChar != this.Separator) reText += this.Separator;
}
#endregion
break;
default:
#region 未知字符,可能是生僻字,也可能是标点符合之类
if (word && !yes)
{
reText += Separator;
}
else if (number && !yes)
{
reText += Separator;
}
number = false;
word = false;
strLastWords = this.Separator;
break;
#endregion
}
// A digit or letter that was not written by the switch is copied through unchanged.
if (!yes && number || !yes && word)
{
reText += strChar1;
yes = true;
}
if (!yes)
{
#region 处理姓名问题
// Name handling: characters found in alPrefix start a pending prefix (strPrefix);
// later characters matching strChinese are appended to it.
if (preFix == )
{
if (alPrefix.Contains(strChar1 + strChar2))
{
i++;
strPrefix = strChar1 + strChar2;
preFix++;
}
else if (alPrefix.Contains(strChar1))
{
if (!number)
{
strPrefix = strChar1;
preFix++;
}
else
{
reText += strChar1 + strLastWords;
number = false;
word = false;
}
}
else
{
if (preFix == )
{
reText += strPrefix + Separator + strChar1 + Separator;
strPrefix = "";
preFix = ;
}
else if (preFix > )
{
if (Regex.IsMatch(strChar1, strChinese))
{
strPrefix += strChar1;
preFix++;
}
else
{
reText += strPrefix + Separator + strChar1 + Separator;
strPrefix = "";
preFix = ;
}
}
else
{
reText += strChar1 + strLastWords;
number = false;
word = false;
}
}
}
else
{
if (preFix == )
{
reText += strPrefix + Separator + strChar1 + Separator;
strPrefix = "";
preFix = ;
}
else if (preFix > )
{
if (Regex.IsMatch(strChar1, strChinese))
{
strPrefix += strChar1;
preFix++;
}
else
{
reText += strPrefix + Separator + strChar1 + Separator;
strPrefix = "";
preFix = ;
}
}
else
{
reText += strChar1 + strLastWords;
number = false;
}
}
#endregion
}
length = i;
#endregion
// NOTE(review): the "#region" on the next line follows a closing brace on the same
// line, which the C# preprocessor rejects; it presumably sat on its own line originally.
} #region 最后防止最后一个字的丢失
if (length < strText.Length - )
{
string strLastChar1 = strText.Substring(strText.Length - ).Trim();
string strLastChar2 = strText.Substring(strText.Length - ).Trim(); if (reText.Length > ) strLastChar = reText.Substring(reText.Length - );
if (preFix != )
{
reText += strPrefix + strLastChar1;
}
else
{
switch (GetCharType(strLastChar1))
{
case :
// NOTE(review): both operands compare against the same "." literal as written;
// one of them was presumably a different (e.g. full-width) character - confirm.
if (strLastChar1 != "." && strLastChar1 != ".")
reText += strLastChar1;
else
reText += Separator + strLastChar1;
break;
case :
case :
if (alWord.Contains(strLastChar2))
reText += strLastChar1;
break;
case :
case :
if ((number || word) && strLastChar != Separator)
reText += Separator + strLastChar1;
else
reText += strLastChar1;
break;
default:
if (strLastChar != Separator)
reText += Separator + strLastChar1;
else
reText += strLastChar1;
break;
}
}
if (reText.Length > ) strLastChar = (reText.Substring(reText.Length - ));
if (strLastChar != this.Separator) reText += this.Separator;
}
// NOTE(review): as written, the "#endregion" line below also contains the declaration
// of "duration". Text after #endregion on the same line is not compiled as code, so
// "duration" is undeclared on the following line. The declaration presumably sat on
// its own line originally.
#endregion TimeSpan duration = DateTime.Now - start;
m_EventTime = duration.TotalMilliseconds;
return reText.Replace(" $", ""); // Removes the appended "$" together with the space in front of it.
} /// <summary>
/// Segmentation overload that can handle multi-line text.
/// </summary>
/// <param name="strText">The text to segment.</param>
/// <param name="Enter">
/// When <c>true</c>, the text is split on '\n', each line is segmented separately and
/// every result line is terminated with "\r\n". When <c>false</c>, the text is passed
/// to the single-argument overload unchanged.
/// </param>
/// <returns>The segmented text.</returns>
public string SegmentText(string strText, bool Enter)
{
    if (!Enter)
    {
        return SegmentText(strText);
    }

    DateTime start = DateTime.Now;
    string[] strArr = strText.Split('\n');
    // A builder avoids re-copying the accumulated result for every line.
    System.Text.StringBuilder reText = new System.Text.StringBuilder();
    for (int i = 0; i < strArr.Length; i++)
    {
        reText.Append(SegmentText(strArr[i])).Append("\r\n");
    }

    // Report the total time for all lines, not just the last one.
    TimeSpan duration = DateTime.Now - start;
    m_EventTime = duration.TotalMilliseconds;
    return reText.ToString();
}

#region 判断字符类型
/// <summary>
/// Classifies a single character: 0 unknown, 1 digit, 2 letter,
/// 3 Chinese character, 4 Chinese numeral.
/// </summary>
/// <param name="p_Char">The character to classify, as a one-character string.</param>
/// <returns>
/// The type code. The base code comes from the number list (1) or the letter list (2);
/// 3 is added when the character is a first-level dictionary key, so a letter that is
/// also a dictionary key yields 5.
/// </returns>
private int GetCharType(string p_Char)
{
    // Literal values restored to match the 0-4 legend in the summary above.
    int CharType = 0;
    if (alNumber.Contains(p_Char))
    {
        CharType = 1;
    }
    if (alWord.Contains(p_Char))
    {
        CharType = 2;
    }
    if (htWords.ContainsKey(p_Char))
    {
        CharType += 3;
    }
    return CharType;
}
#endregion

#region 对加载的词典排序并重新写入
/// <summary>
/// Sorts the loaded dictionary and writes it back to the dictionary file without reloading it.
/// </summary>
public void SortDic()
{
    SortDic(false);
}

/// <summary>
/// Sorts every suffix list of the loaded dictionary (longest first) and rewrites
/// the dictionary file at DicPath with the result.
/// </summary>
/// <param name="Reload">When <c>true</c>, InitWordDics is called after the file has been written.</param>
/// <exception cref="InvalidOperationException">The dictionary has not been loaded.</exception>
public void SortDic(bool Reload)
{
    // Check before opening the writer: opening it truncates the dictionary file.
    if (htWords == null)
    {
        throw new InvalidOperationException("The dictionary has not been loaded; call InitWordDics first.");
    }

    DateTime start = DateTime.Now;
    using (StreamWriter sw = new StreamWriter(DicPath, false, System.Text.Encoding.UTF8))
    {
        foreach (DictionaryEntry first in htWords)
        {
            foreach (DictionaryEntry second in (Hashtable)first.Value)
            {
                SegList aa = (SegList)second.Value;
                aa.Sort();
                string head = first.Key.ToString() + second.Key.ToString();
                for (int i = 0; i < aa.Count; i++)
                {
                    string tail = aa.GetElem(i).ToString();
                    // The "null" placeholder stands for a two-character word with no suffix.
                    if (tail == "null")
                    {
                        sw.WriteLine(head);
                    }
                    else
                    {
                        sw.WriteLine(head + tail);
                    }
                }
            }
        }
    }

    if (Reload)
    {
        InitWordDics();
    }

    TimeSpan duration = DateTime.Now - start;
    m_EventTime = duration.TotalMilliseconds;
}
#endregion

/// <summary>
/// Removes exact duplicate lines from the dictionary file at DicPath and rewrites it.
/// The original note marks this method as currently unused.
/// Reading stops at the first blank line, so anything after it is not written back.
/// Lines are written in hash-table enumeration order, not in their original order.
/// </summary>
/// <returns>The number of duplicate lines found.</returns>
public int Optimize()
{
    int l = 0;
    DateTime start = DateTime.Now;

    Hashtable htOptimize = new Hashtable();
    using (StreamReader reader = new StreamReader(DicPath, System.Text.Encoding.UTF8))
    {
        string strline;
        // Read a new line on every pass (previously the first line was tested forever).
        while ((strline = reader.ReadLine()) != null && strline.Trim() != "")
        {
            if (!htOptimize.ContainsKey(strline))
            {
                htOptimize.Add(strline, null);
            }
            else
            {
                l++;
            }
        }
    }
    Console.WriteLine("ready");

    using (StreamWriter sw = new StreamWriter(DicPath, false, System.Text.Encoding.UTF8))
    {
        foreach (DictionaryEntry entry in htOptimize)
        {
            sw.WriteLine(entry.Key.ToString());
        }
    }

    TimeSpan duration = DateTime.Now - start;
    m_EventTime = duration.TotalMilliseconds;
    return l;
}
#endregion
}
}

[分词] C#SegList分词辅助类,帮助类 (转载)的更多相关文章

  1. ElasticSearch已经配置好ik分词和mmseg分词(转)

    ElasticSearch是一个基于Lucene构建的开源,分布式,RESTful搜索引擎.设计用于云计算中,能够达到实时搜索,稳定,可靠,快速,安装使用方便.支持通过HTTP使用JSON进行数据索引 ...

  2. 为 Elasticsearch 添加中文分词,对比分词器效果

    转自:http://keenwon.com/1404.html 为 Elasticsearch 添加中文分词,对比分词器效果 Posted in 后端 By KeenWon On 2014年12月12 ...

  3. python中文分词:结巴分词

    中文分词是中文文本处理的一个基础性工作,结巴分词利用进行中文分词.其基本实现原理有三点: 基于Trie树结构实现高效的词图扫描,生成句子中汉字所有可能成词情况所构成的有向无环图(DAG) 采用了动态规 ...

  4. .添加索引和类型,同时设定edgengram分词和charsplit分词

    1.添加索引和类型,同时设定edgengram分词和charsplit分词 curl -XPUT 'http://127.0.0.1:9200/userindex/' -d '{   "se ...

  5. 为Elasticsearch添加中文分词,对比分词器效果

    http://keenwon.com/1404.html Elasticsearch中,内置了很多分词器(analyzers),例如standard (标准分词器).english(英文分词)和chi ...

  6. ES 09 - 定制Elasticsearch的分词器 (自定义分词策略)

    目录 1 索引的分析 1.1 分析器的组成 1.2 倒排索引的核心原理-normalization 2 ES的默认分词器 3 修改分词器 4 定制分词器 4.1 向索引中添加自定义的分词器 4.2 测 ...

  7. Elasticsearch拼音分词和IK分词的安装及使用

    一.Es插件配置及下载 1.IK分词器的下载安装 关于IK分词器的介绍不再多少,一言以蔽之,IK分词是目前使用非常广泛分词效果比较好的中文分词器.做ES开发的,中文分词十有八九使用的都是IK分词器. ...

  8. 和我一起打造个简单搜索之IK分词以及拼音分词

    elasticsearch 官方默认的分词插件,对中文分词效果不理想,它是把中文词语分成了一个一个的汉字.所以我们引入 es 插件 es-ik.同时为了提升用户体验,引入 es-pinyin 插件.本 ...

  9. 盘古分词+一元/二元分词Lucene

    本文参考自:https://blog.csdn.net/mss359681091/article/details/52078147 http://www.cnblogs.com/top5/archiv ...

随机推荐

  1. 利用switch case 来运行咱们结婚吧

    static void Main(string[] args)        {            while (true)            {                int x, ...

  2. LinkedIn高级分析师王益:大数据时代的理想主义和现实主义(图灵访谈)

    转自:http://www.ituring.com.cn/article/75445 王益,LinkedIn高级分析师.他曾在腾讯担任广告算法和策略的技术总监,在此期间他发明了并行机器学习系统“孔雀” ...

  3. 搜索(另类状态BFS):NOIP 华容道

    描述 小 B 最近迷上了华容道,可是他总是要花很长的时间才能完成一次.于是,他想到用编程来完成华容道:给定一种局面,华容道是否根本就无法完成,如果能完成,最少需要多少时间. 小 B 玩的华容道与经典的 ...

  4. [PeterDLax著泛函分析习题参考解答]第7章 Hilbert 空间结果的应用

    1. 对测度是 $\sigma$ 有限的情形证明 Radon-Nikodym 定理. 证明: 设 $\mu,\nu$ 均为 $\sigma$ 有限的非负测度, 则存在分割 $$\bex X=\cup_ ...

  5. Ubuntu下安装mysql-python包

      sudo apt-get install libmysqld-dev sudo apt-get install libmysqlclient-dev sudo apt-get install py ...

  6. Git起步--git安装与初次运行git前配置

    在你开始使用 Git 前,需要将它安装在你的计算机上. 即便已经安装,最好将它升级到最新的版本. 你可以通过软件包或者其它安装程序来安装,或者下载源码编译安装. 一.Git安装 1. 在linux上安 ...

  7. HDOJ 1004题 Let the Balloon Rise strcmp()函数

    Problem Description Contest time again! How excited it is to see balloons floating around. But to te ...

  8. ldr指令总结

    LDR/STR字和无符号字节加载/存储 1,LDR Rd,[Rn]   2, LDR Rd,[Rn,Flexoffset] 3, LDR Rd,[Rn],Flexoffset 4, LDR Rd,la ...

  9. 洛谷1377 M国王 (SCOI2005互不侵犯King)

    洛谷1377 M国王 (SCOI2005互不侵犯King) 本题地址:http://www.luogu.org/problem/show?pid=1377 题目描述 天天都是n皇后,多么无聊啊.我们来 ...

  10. numpy note_1

    tile(A,reps) 通过重复 reps次数的A 创建一个数组 examples: >>> a = np.array([0, 1, 2]) >>> np.til ...