记录lucene.net的使用过程
之前公司要做一个信息展示的网站,领导说要用lucene.net来实现全文检索,类似百度的搜索功能,但是本人技术有限,只是基本实现搜索和高亮功能,特此记录;
先看下页面效果,首先我搜索“为什么APP消息没有推送”,出来的结果如下图:

然后我再搜索“醒 消息 推”,出来结果如下图:

然后说下,我使用的是Lucene.net版本是2.9.22,盘古分词的版本是2.3.1,注意,版本lucene.net和盘古分词的版本一定要对上,之前我用Lucene.net3.0的版本,就一直有错误,后来换到低版本才没问题的;
接着是关键的类LuceneHelper,如下所示:
public class LuceneHelper
{
readonly LogHelper _logHelper = new LogHelper(MethodBase.GetCurrentMethod());
private LuceneHelper() { } #region 单例
private static LuceneHelper _instance = null;
private static readonly object Lock = new object();
/// <summary>
/// 单例
/// </summary>
public static LuceneHelper instance
{
get
{
lock (Lock)
{
if (_instance == null)
{
_instance = new LuceneHelper();
PanGu.Segment.Init(PanGuXmlPath);//使用盘古分词,一定要记得初始化
}
return _instance;
}
}
}
#endregion #region 分词测试 /// <summary>
/// 处理关键字为索引格式
/// </summary>
/// <param name="keywords"></param>
/// <returns></returns>
private string GetKeyWordsSplitBySpace(string keywords)
{
PanGuTokenizer ktTokenizer = new PanGuTokenizer();//使用盘古分词器来吧关键字分词
StringBuilder result = new StringBuilder();
ICollection<WordInfo> words = ktTokenizer.SegmentToWordInfos(keywords);
foreach (WordInfo word in words)
{
if (word == null)
{
continue;
}
//result.AppendFormat("{0}^{1}.0 ", word.Word, (int)Math.Pow(3, word.Rank));
result.AppendFormat("{0} ", word.Word);
}
return result.ToString().Trim();
}
#endregion #region 创建索引
/// <summary>
/// 创建索引
/// </summary>
/// <param name="datalist"></param>
/// <returns></returns>
public bool CreateIndex<T>(IList<T> datalist)
{
IndexWriter writer = null;
try
{
writer = new IndexWriter(directory_luce, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);//false表示追加(true表示删除之前的重新写入)
//writer = new IndexWriter(directory_luce, null, true, IndexWriter.MaxFieldLength.LIMITED);//false表示追加(true表示删除之前的重新写入)
}
catch
{
writer = new IndexWriter(directory_luce, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);//false表示追加(true表示删除之前的重新写入)
//writer = new IndexWriter(directory_luce, null, true, IndexWriter.MaxFieldLength.LIMITED);//false表示追加(true表示删除之前的重新写入)
}
foreach (var data in datalist)
{
CreateIndex<T>(writer, data);
}
writer.Optimize();
writer.Close();
return true;
} public bool CreateIndex<T>(IndexWriter writer, T data)
{
try
{ if (data == null) return false;
Document doc = new Document();
Type type = data.GetType(); //创建类的实例
//object obj = Activator.CreateInstance(type, true);
//获取公共属性
PropertyInfo[] Propertys = type.GetProperties();
for (int i = ; i < Propertys.Length; i++)
{
//Propertys[i].SetValue(Propertys[i], i, null); //设置值
PropertyInfo pi = Propertys[i];
string name = pi.Name;
object objval = pi.GetValue(data, null);
string value = objval == null ? "" : objval.ToString(); //值
if (name.ToLower() == "id" || name.ToLower() == "type")//id在写入索引时必是不分词,否则是模糊搜索和删除,会出现混乱
{
doc.Add(new Field(name, value, Field.Store.YES, Field.Index.NOT_ANALYZED));//id不分词
}
else if (name.ToLower() == "IsNewest".ToLower())
{
//doc.Add(new Field(name, value, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS));//分词建索引,但是Field的值不像通常那样被保存,而是只取一个byte,这样节约存储空间
doc.Add(new Field(name, value, Field.Store.YES, Field.Index.NOT_ANALYZED));//IsNewest不分词
}
else if (name.ToLower() == "IsReqular".ToLower())
{
//doc.Add(new Field(name, value, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS));//分词建索引,但是Field的值不像通常那样被保存,而是只取一个byte,这样节约存储空间
doc.Add(new Field(name, value, Field.Store.YES, Field.Index.NOT_ANALYZED));//IsReqular不分词
}
else
{
if (name.ToLower() == "Contents".ToLower())
{
value = GetNoHtml(value);//去除正文的html标签
}
doc.Add(new Field(name, value, Field.Store.YES, Field.Index.ANALYZED));//其他字段分词
}
}
writer.AddDocument(doc);
}
catch (System.IO.FileNotFoundException fnfe)
{
throw fnfe;
}
return true;
}
#endregion #region 在title和content字段中查询数据,该方法未使用,可能有错漏,我使用的是下面的分页查询的;
/// <summary>
/// 在title和content字段中查询数据
/// </summary>
/// <param name="keyword"></param>
/// <returns></returns>
public List<Questions> Search(string keyword)
{ string[] fileds = { "Title", "Contents" };//查询字段
//Stopwatch st = new Stopwatch();
//st.Start();
QueryParser parser = null;// new QueryParser(Lucene.Net.Util.Version.LUCENE_30, field, analyzer);//一个字段查询
parser = new MultiFieldQueryParser(version, fileds, analyzer);//多个字段查询
Query query = parser.Parse(keyword);
int n = ;
IndexSearcher searcher = new IndexSearcher(directory_luce, true);//true-表示只读
TopDocs docs = searcher.Search(query, (Filter)null, n);
if (docs == null || docs.totalHits == )
{
return null;
}
else
{
List<Questions> list = new List<Questions>();
int counter = ;
foreach (ScoreDoc sd in docs.scoreDocs)//遍历搜索到的结果
{
try
{
Document doc = searcher.Doc(sd.doc); string id = doc.Get("ID");
string title = doc.Get("Title");
string content = doc.Get("Contents"); string createdate = doc.Get("AddTime");
PanGu.HighLight.SimpleHTMLFormatter simpleHTMLFormatter = new PanGu.HighLight.SimpleHTMLFormatter("<font color=\"red\">", "</font>");
PanGu.HighLight.Highlighter highlighter = new PanGu.HighLight.Highlighter(simpleHTMLFormatter, new PanGu.Segment());
highlighter.FragmentSize = Int32.MaxValue;
content = highlighter.GetBestFragment(keyword, content);
string titlehighlight = highlighter.GetBestFragment(keyword, title);
if (titlehighlight != "") title = titlehighlight; Questions model = new Questions
{
ID = int.Parse(id),
Title = title,
Contents = content,
AddTime = DateTime.Parse(createdate)
}; list.Add(model);
}
catch (Exception ex)
{
Console.WriteLine(ex.Message);
}
counter++;
}
return list;
}
//st.Stop();
//Response.Write("查询时间:" + st.ElapsedMilliseconds + " 毫秒<br/>"); }
#endregion #region 在不同的分类下再根据title和content字段中查询数据(分页)
/// <summary>
/// 在不同的类型下再根据title和content字段中查询数据(分页)
/// </summary>
/// <param name="_type">分类,传空值查询全部</param>
/// <param name="keyword"></param>
/// <param name="PageIndex"></param>
/// <param name="PageSize"></param>
/// <param name="TotalCount"></param>
/// <returns></returns>
public List<Questions> Search(string _type,bool? _isnew,bool? _isreq ,string keyword, int PageIndex, int PageSize, out int TotalCount)
{
try
{
if (PageIndex < ) PageIndex = ;
//Stopwatch st = new Stopwatch();
//st.Start();
BooleanQuery bq = new BooleanQuery();
if (_type != "" && _type != "-100")
{
QueryParser qpflag = new QueryParser(version, "Type", analyzer);//一个字段查询
Query qflag = qpflag.Parse(_type);
bq.Add(qflag, Lucene.Net.Search.BooleanClause.Occur.MUST);//与运算
}
if (_isnew.HasValue)
{
QueryParser qpnew = new QueryParser(version, "IsNewest", analyzer);
Query qnew = qpnew.Parse(_isnew.Value.ToString());
bq.Add(qnew, Lucene.Net.Search.BooleanClause.Occur.MUST);
}
if (_isreq.HasValue)
{
QueryParser qpreq = new QueryParser(version, "IsReqular", analyzer);
Query qreq = qpreq.Parse(_isnew.Value.ToString());
bq.Add(qreq, Lucene.Net.Search.BooleanClause.Occur.MUST);
} string keyword2 = keyword;
if (keyword != "")
{ keyword = GetKeyWordsSplitBySpace(keyword); string[] fileds = { "Title", "Contents" };//查询字段
QueryParser parser = null;// new QueryParser(version, field, analyzer);//一个字段查询
parser = new MultiFieldQueryParser(version, fileds, analyzer);//多个字段查询
//parser.DefaultOperator = QueryParser.Operator.OR;
parser.SetDefaultOperator(QueryParser.Operator.OR);//这里QueryParser.Operator.OR表示并行结果,相当于模糊搜索,QueryParser.Operator.AND相当于精准搜索
Query queryKeyword = parser.Parse(keyword); bq.Add(queryKeyword, Lucene.Net.Search.BooleanClause.Occur.MUST);//与运算
} //TopScoreDocCollector collector = TopScoreDocCollector.Create(PageIndex * PageSize, false);
IndexSearcher searcher = new IndexSearcher(directory_luce, true);//true-表示只读 //Sort sort = new Sort(new SortField("AddTime", SortField.DOC, false)); //此处为结果排序功能,但是使用排序会影响搜索权重(类似百度搜索排名机制)
//TopDocs topDocs = searcher.Search(bq, null, PageIndex * PageSize, sort);
TopDocs topDocs = searcher.Search(bq, null, PageIndex * PageSize);
//searcher.Search(bq, collector);
if (topDocs == null || topDocs.totalHits == )
{
TotalCount = ;
return null;
}
else
{
int start = PageSize * (PageIndex - );
//结束数
int limit = PageSize;
ScoreDoc[] hits = topDocs.scoreDocs;
List<Questions> list = new List<Questions>();
int counter = ;
TotalCount = topDocs.totalHits;//获取Lucene索引里的记录总数 //Lucene.Net.Highlight.SimpleHTMLFormatter simpleHTMLFormatter = new Lucene.Net.Highlight.SimpleHTMLFormatter("<em class=\"hl-l-t-main\">", "</em>");
//Lucene.Net.Highlight.Highlighter highlighter = new Lucene.Net.Highlight.Highlighter(simpleHTMLFormatter,new Lucene.Net.Highlight.QueryScorer(bq)); foreach (ScoreDoc sd in hits)//遍历搜索到的结果
{
try
{
Document doc = searcher.Doc(sd.doc);
string id = doc.Get("ID");
string title = doc.Get("Title");
string content = doc.Get("Contents");
string updatetime = doc.Get("AddTime"); PanGu.HighLight.SimpleHTMLFormatter simpleHTMLFormatter = new PanGu.HighLight.SimpleHTMLFormatter("<em class=\"hl-l-t-main\">", "</em>");
PanGu.HighLight.Highlighter highlighter = new PanGu.HighLight.Highlighter(simpleHTMLFormatter, new Segment());//搜索关键字高亮显示,上面的高亮样式自己写
highlighter.FragmentSize = Int32.MaxValue; //这里如果值小于搜索内容的长度的话,会导致搜索结果被截断,因此设置最大,根据需求来吧
string contentHighlight = highlighter.GetBestFragment(keyword2, content);
string titleHighlight = highlighter.GetBestFragment(keyword2, title); //string titleHighlight = highlighter.GetBestFragment(analyzer, "Title", title); //string contentHighlight = highlighter.GetBestFragment(analyzer, "Contents", content); title = string.IsNullOrEmpty(titleHighlight) ? title : titleHighlight;
content = string.IsNullOrEmpty(contentHighlight) ? content : contentHighlight; var model = new Questions
{
ID = int.Parse(id),
Title = title,
Contents = content,
AddTime = DateTime.Parse(updatetime)
};
list.Add(model);
}
catch (Exception ex)
{
//这里可以写错误日志
}
counter++;
}
return list;
}
//st.Stop();
}
catch (Exception e)
{
TotalCount = ;
return null;
} } /// <summary>
/// 去除html标签
/// </summary>
/// <param name="StrHtml"></param>
/// <returns></returns>
public string GetNoHtml(string StrHtml)
{
string strText="";
if (!string.IsNullOrEmpty(StrHtml))
{
strText = System.Text.RegularExpressions.Regex.Replace(StrHtml, @"<[^>]+>", "");
strText = System.Text.RegularExpressions.Regex.Replace(strText, @"&[^;]+;", "");
strText = System.Text.RegularExpressions.Regex.Replace(strText, @"\\s*|\t|\r|\n", ""); }
return strText; }
#endregion #region 删除索引数据(根据id)
/// <summary>
/// 删除索引数据(根据id)
/// </summary>
/// <param name="id"></param>
/// <returns></returns>
public bool Delete(string id)
{
bool IsSuccess = false;
Term term = new Term("id", id);
//Analyzer analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
//Version version = new Version();
//MultiFieldQueryParser parser = new MultiFieldQueryParser(version, new string[] { "name", "job" }, analyzer);//多个字段查询
//Query query = parser.Parse("小王"); //IndexReader reader = IndexReader.Open(directory_luce, false);
//reader.DeleteDocuments(term);
//Response.Write("删除记录结果: " + reader.HasDeletions + "<br/>");
//reader.Dispose(); IndexWriter writer = new IndexWriter(directory_luce, analyzer, false, IndexWriter.MaxFieldLength.LIMITED);
writer.DeleteDocuments(term); // writer.DeleteDocuments(term)或者writer.DeleteDocuments(query);
////writer.DeleteAll();
writer.Commit();
//writer.Optimize();//
IsSuccess = writer.HasDeletions();
writer.Close();
return IsSuccess;
}
#endregion #region 删除全部索引数据
/// <summary>
/// 删除全部索引数据
/// </summary>
/// <returns></returns>
public bool DeleteAll()
{
bool IsSuccess = true;
try
{
IndexWriter writer = new IndexWriter(directory_luce, analyzer, false, IndexWriter.MaxFieldLength.LIMITED);
writer.DeleteAll();
writer.Commit();
writer.Optimize();//
IsSuccess = writer.HasDeletions();
writer.Close();
}
catch
{
IsSuccess = false;
}
return IsSuccess;
}
#endregion #region directory_luce
private Lucene.Net.Store.Directory _directory_luce = null;
/// <summary>
/// Lucene.Net的目录-参数
/// </summary>
public Lucene.Net.Store.Directory directory_luce
{
get
{
if (_directory_luce == null) _directory_luce = Lucene.Net.Store.FSDirectory.Open(directory);
return _directory_luce;
}
}
#endregion #region directory
private System.IO.DirectoryInfo _directory = null;
/// <summary>
/// 索引在硬盘上的目录
/// </summary>
public System.IO.DirectoryInfo directory
{
get
{
if (_directory == null)
{
string dirPath = HttpContext.Current.Server.MapPath("/LuceneDic");
if (System.IO.Directory.Exists(dirPath) == false)
_directory = System.IO.Directory.CreateDirectory(dirPath);
else
_directory = new System.IO.DirectoryInfo(dirPath);
}
return _directory;
}
}
#endregion #region analyzer
private Analyzer _analyzer = null;
/// <summary>
/// 分析器
/// </summary>
public Analyzer analyzer
{
get
{
//if (_analyzer == null)
{
// _analyzer = new Lucene.Net.Analysis.PanGu.PanGuAnalyzer();//弃用盘古分词,感觉有点问题,测试下来没有自带分词好用,也有可能是好用的,但是之前用的高版本lucene.net,导致分词失效
_analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);
}
return _analyzer;
}
}
#endregion #region version
private static Lucene.Net.Util.Version _version = Lucene.Net.Util.Version.LUCENE_29;
/// <summary>
/// 版本号枚举类
/// </summary>
public Lucene.Net.Util.Version version
{
get
{
return _version;
}
}
#endregion
/// <summary>
/// 盘古分词的配置文件
/// </summary>
protected static string PanGuXmlPath
{
get
{
return HttpContext.Current.Server.MapPath("/PanGu/PanGu.xml");
}
}
}
然后是一些需要引用的DLL和盘古分词的字典文件等,如下所示:
至此Lucene.net的简单应用到此结束,谢谢!
记录lucene.net的使用过程的更多相关文章
- 记录sqoop同步失败问题解决过程,过程真的是很崎岖。(1月6日解决)
记录sqoop同步失败问题解决过程,过程真的是很崎岖.事发原因:最近突然出现sqoop export to mysql时频繁出错.看了下日志是卡在某条数据过不去了,看异常.看sqoop生成的mr并未发 ...
- step_by_step_记录deepin下curl安装过程
记录 deepin 下 curl 安装过程 wget https://curl.haxx.se/download/curl-7.55.1.tar.gz .tar.gz cd curl-/ ./conf ...
- 【Android实战】记录自学自己定义GifView过程,能同一时候支持gif和其它图片!【有用篇】
之前写了一篇博客.<[Android实战]记录自学自己定义GifView过程,具体解释属性那些事! [学习篇]> 关于自己定义GifView的,具体解说了学习过程及遇到的一些类的解释,然后 ...
- 记录手动签名APK的过程
记录手动签名APK的过程 前两天更新了华为平台上的APK,被驳回,原因是新APK签名和老的APK不一致,老用户安装会失败,用命令行安装会报如下的错误: harlanc@harlancdeMacBook ...
- 记录一下安装hexo的过程
记录一下安装hexo的过程 首先你的电脑需要安装node.js和Git 安装好Git之后需要配置本机与Github之间的ssh方便更新同步博客到Github上,在一个地方新建一个文件夹作为我们博客的根 ...
- 理解Lucene索引与搜索过程中的核心类
理解索引过程中的核心类 执行简单索引的时候需要用的类有: IndexWriter.Directory.Analyzer.Document.Field 1.IndexWriter IndexWr ...
- lucene 建立索引的过程
时间 -- :: CSDN博客 原文 http://blog.csdn.net/caohaicheng/article/details/ 看lucene主页(http://lucene.apach ...
- lucene建立索引的过程
建立索引过程 用户提交数据=>solr建立索引=>调用lucene包建立索引 官方建立索引和查询索引的例子如下: http://lucene.apache.org/core/4_10_3/ ...
- 记录一次OOM分析过程
工具: jstat jmap jhat 1.jstat查看gc情况 S0C.S1C.S0U.S1U:Survivor 0/1区容量(Capacity)和使用量(Used) EC.EU:Eden区容量和 ...
随机推荐
- 深入理解Flink ---- End-to-End Exactly-Once语义
上一篇文章所述的Exactly-Once语义是针对Flink系统内部而言的. 那么Flink和外部系统(如Kafka)之间的消息传递如何做到exactly once呢? 问题所在: 如上图,当sink ...
- 使用java NIO及高速缓冲区写入文件
byte[] bytes = Files.readAllBytes(Paths.get("E:\\pdf\\aaa\\html\\text.txt").normalize()); ...
- selenium+python自动化测试-环境搭建
firefox浏览器打不开的解决办法: 1.确认将geckodriver拷贝到Firefox安装目录 2.将安装目录添加到Windows的path里 3.重启IDE
- webview的学习总结:
1.1: Weview常见的坑 及其 内存泄漏的解决方案: WebView 1. WebView常见的坑 API 16之前版本存在远程代码执行漏洞,该漏洞源自于程序没有正确限制使用WebView.ad ...
- Bash Shellshock(CVE-2014-6271)破壳漏洞测试
0x01 漏洞原理 Bash使用的环境变量是通过函数名称来调用的,导致漏洞出问题是以"(){"开头定义的环境变量在命令ENV中解析成函数后,Bash执行并未退出,而是继续解析并执行 ...
- python进阶-mock接口
setting.py MYSQL_HOST='192.168.127.139' PASSWORD=' PORT=3306 USER='root' DB='stu' tools.py import py ...
- GCE 部署 ELK 7.1可视化分析 nginx
目录 一.准备 1.1.服务器环境准备 二.安装 ES 2.1.遇到小问题 三.安装 Kibana 四.安装 Logstash 一.准备 我这边有一个网站放在了 Google VM 上面,所以打算在购 ...
- Flutter Window环境运行(VSCode + 单独运行Android 虚拟机)
官网以及很多网上文章的开发都是基于Android ,因为它能创建不同类型移动设备虚拟机.但个人始终觉得它太庞大,启动慢耗资源,但我们使用Flutter又离不开虚拟机. 经过实践,现在能成功的单独启动移 ...
- 通过bat批处理程序如何实现在多个txt文件后面加上相同的一行文字
通过bat批处理程序如何实现在多个txt文件后面加上相同的一行文字 set/p a=输入要增加的文字 for /f "delims=" %%i in ('dir /b *.txt' ...
- Qt中mouseMoveEvent无效
最近用Qt软件界面,需要用到mouseMoveEvent,研究了下,发现些问题,分享一下. 在Qt中要捕捉鼠标移动事件需要重写MouseMoveEvent,但是MouseMoveEvent为了不太耗资 ...