LuceneNet 搜索一
1.引用读取PDF文件组件
FontBox-0.1.0-dev.dll
IKVM.GNU.Classpath.dll
IKVM.Runtime.dll
PDFBox-0.7.3.dll
2.添加 Office 组件引用(代码中用到 Microsoft.Office.Interop.Word 和 Microsoft.Office.Interop.Excel),具体添加步骤此处略过
3.添加盘古分词
PanGu.dll
PanGu.HighLight.dll
PanGu.Lucene.Analyzer.dll
4.添加Lucene.net 引用
Lucene.Net.dll
5.创建索引库
#region 同步资料到索引库
/// <summary>
/// Resources waiting to be written to (or removed from) the index.
/// The queue instance is also used as the lock object by the code that
/// dequeues from it, so the reference is readonly: a reassigned queue would
/// silently break that mutual exclusion.
/// </summary>
static readonly Queue<ResourcesModel> TaskQueue = new Queue<ResourcesModel>();
/// <summary>
/// Event handler (presumably wired to a timer's Tick event - confirm in the
/// designer file) that loads the resources still waiting to be indexed into
/// the task queue. Failures are logged and swallowed so the handler never
/// throws back into the caller.
/// </summary>
/// <param name="sender">Event source; not used.</param>
/// <param name="e">Event data; not used.</param>
private void tmResSync_Tick(object sender, EventArgs e)
{
    try
    {
        InitTaskQueue();
        LogHelper.writeLog("WinFrom【同步数据索引库读取数据到队列】成功!!");
    }
    catch (Exception ex)
    {
        // Log the whole exception (type, message and stack trace) rather than
        // only ex.Message, so a failure can actually be diagnosed from the log.
        LogHelper.writeErrLog("WinFrom【同步数据索引库读取数据到队列】:" + ex.ToString());
    }
}
/// <summary>
/// Launches the indexing worker: a background thread whose entry point is
/// <see cref="ThreadInvoke"/>. Being a background thread, it does not keep
/// the process alive on its own.
/// </summary>
public void ServiceStart()
{
    ThreadStart entryPoint = new ThreadStart(ThreadInvoke);
    Thread worker = new Thread(entryPoint) { IsBackground = true };
    worker.Start();
}
/// <summary>
/// Worker loop: takes one queued resource at a time and hands it to
/// <c>CreateResIndex.CreateIndex</c>; sleeps briefly when the queue is empty.
/// The loop never returns. Any exception raised while processing an item is
/// logged and the loop continues with the next item.
/// </summary>
public void ThreadInvoke()
{
    // NOTE(review): the original sleep interval was lost when this code was
    // extracted (the source reads "Thread.Sleep()"); 1000 ms is an assumed
    // value - confirm against the real project.
    const int idlePollIntervalMs = 1000;

    while (true)
    {
        try
        {
            ResourcesModel res = null;
            bool hasItem = false;

            // Check and dequeue under the same lock: testing Count outside the
            // lock lets another thread change the queue between the test and
            // the Dequeue call.
            lock (TaskQueue)
            {
                if (TaskQueue.Count > 0)
                {
                    res = TaskQueue.Dequeue();
                    hasItem = true;
                }
            }

            if (hasItem)
            {
                // Index (or delete) this single resource.
                new CreateResIndex().CreateIndex(res);
            }
            else
            {
                Thread.Sleep(idlePollIntervalMs);
            }
        }
        catch (Exception ex)
        {
            LogHelper.writeErrLog("WinFrom【同步数据索引库出错】:"+ex.ToString());
        }
    }
}
/// <summary>
/// Reads every row of the resource view that has an upload time and is not
/// yet indexed (IsIndex=0), converts each row into a
/// <see cref="ResourcesModel"/> and appends it to the task queue.
/// </summary>
/// <remarks>
/// NOTE(review): rows stay IsIndex=0 until they have been processed, so calling
/// this again before the worker catches up enqueues the same rows again.
/// </remarks>
public void InitTaskQueue()
{
    var query = new CreateResIndex().Get_View_CreateResIndex(" and uploadTime is not null and IsIndex=0 ");
    if (query == null)
    {
        return;
    }

    for (int i = 0; i < query.Rows.Count; i++)
    {
        var row = query.Rows[i];

        var model = new ResourcesModel();
        model.ID = row["ID"].ToString();
        model.FileName = CellText(row["FileName"]);
        model.FilePath = CellText(row["FilePath"]);
        model.CreaetBy = CellText(row["UserName"]);
        model.Types = CellText(row["Name"]);
        model.TypeId = CellText(row["Type"]);
        model.SimpleDesc = CellText(row["SimpleDesc"]);
        model.Title = CellText(row["Title"]);
        model.Tags = CellText(row["Tag"]);

        // NOTE(review): the three OP literals were stripped from the source
        // (it reads ? "" : "" : ""). Restored from the ResourcesModel.OP
        // comment (delete=0, add=1, modify=2): a deleted row becomes "0",
        // everything else "1". Confirm against the real project.
        string isDel = CellText(row["IsDel"]);
        model.OP = (isDel != "" && Convert.ToBoolean(isDel)) ? "0" : "1";

        string uploadTime = CellText(row["uploadTime"]);
        model.UploadTime = uploadTime != ""
            ? Convert.ToDateTime(row["uploadTime"]).ToString("yyyy-MM-dd")
            : "";

        // The worker thread dequeues under lock (TaskQueue); Queue<T> is not
        // thread-safe, so the producer must take the same lock.
        lock (TaskQueue)
        {
            TaskQueue.Enqueue(model);
        }
    }
}

/// <summary>
/// Returns the text of a data cell, or an empty string when the value is null.
/// </summary>
private static string CellText(object value)
{
    return value == null ? "" : value.ToString();
}
#endregion
#region ResourcesModel
/// <summary>
/// Plain data holder describing one resource record that is to be added to,
/// updated in, or removed from the search index. All values are kept as strings.
/// </summary>
public class ResourcesModel
{
    /// <summary>Identifier of the resource.</summary>
    public string ID { get; set; }

    /// <summary>Title.</summary>
    public string Title { get; set; }

    /// <summary>Tags.</summary>
    public string Tags { get; set; }

    /// <summary>Creator.</summary>
    public string CreaetBy { get; set; }

    /// <summary>Upload time.</summary>
    public string UploadTime { get; set; }

    /// <summary>Category.</summary>
    public string Types { get; set; }

    /// <summary>Short description.</summary>
    public string SimpleDesc { get; set; }

    /// <summary>Content.</summary>
    public string ContextDesc { get; set; }

    /// <summary>Operation marker: delete = 0, add = 1, modify = 2.</summary>
    public string OP { get; set; }

    /// <summary>Type id.</summary>
    public string TypeId { get; set; }

    /// <summary>File path.</summary>
    public string FilePath { get; set; }

    /// <summary>File name.</summary>
    public string FileName { get; set; }
}
#endregion
#region 读取文件
/// <summary>
/// Extracts plain text from resource files so that the text can be indexed.
/// Supported extensions: doc/docx (Word interop), xls/xlsx (Excel interop),
/// pdf (PDFBox) and txt.
/// </summary>
public class ReadFilesTxt
{
    /// <summary>
    /// Reads a text file. The file is first decoded with the system default
    /// encoding; if the decoded text contains the marker "GB2312" (any case),
    /// the file is read again as GB2312 and that result is returned instead.
    /// </summary>
    /// <param name="path">Full path of the text file.</param>
    /// <returns>The decoded file content.</returns>
    public string ResumeTxt(string path)
    {
        string str;
        // using: the reader (and its file handle) is released even if ReadToEnd throws.
        using (StreamReader reader = new StreamReader(path, System.Text.Encoding.Default))
        {
            str = reader.ReadToEnd();
        }

        if (str.IndexOf("GB2312", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            using (StreamReader gbReader = new StreamReader(path, System.Text.Encoding.GetEncoding("GB2312")))
            {
                str = gbReader.ReadToEnd();
            }
        }

        return str;
    }

    /// <summary>
    /// Opens a Word document read-only through Word interop and returns the
    /// text of its whole content range.
    /// </summary>
    /// <param name="path">Full path of the document.</param>
    private string ResumeWord(string path)
    {
        object missing = System.Reflection.Missing.Value;
        object readOnly = true;
        object docPathp = path;
        Microsoft.Office.Interop.Word.Application wordApp = new Microsoft.Office.Interop.Word.Application();
        Microsoft.Office.Interop.Word.Document wordDoc = null;
        try
        {
            wordDoc = wordApp.Documents.Open(ref docPathp,
                ref missing, ref readOnly, ref missing, ref missing,
                ref missing, ref missing, ref missing, ref missing,
                ref missing, ref missing, ref missing, ref missing,
                ref missing, ref missing, ref missing);
            return wordDoc.Content.Text;
        }
        finally
        {
            // Always close the document and quit Word; otherwise a failed read
            // leaves the Word instance started above running.
            if (wordDoc != null)
            {
                wordDoc.Close(ref missing, ref missing, ref missing);
            }
            wordApp.Quit(ref missing, ref missing, ref missing);
        }
    }

    /// <summary>
    /// Opens a workbook read-only through Excel interop and concatenates the
    /// Value2 of every cell in rows 1..UsedRange.Rows.Count and columns
    /// 1..UsedRange.Columns.Count of every sheet, with no separators.
    /// </summary>
    /// <param name="path">Full path of the workbook.</param>
    private string ResumeExcel(string path)
    {
        StringBuilder text = new StringBuilder();
        object readOnly = true;
        object missing = System.Reflection.Missing.Value;
        Microsoft.Office.Interop.Excel.Application xApp = new Microsoft.Office.Interop.Excel.Application();
        Microsoft.Office.Interop.Excel.Workbook xBook = null;
        try
        {
            xApp.Visible = false;
            xBook = xApp.Workbooks._Open(path,
                missing, readOnly, missing, missing,
                missing, missing, missing, missing,
                missing, missing, missing, missing);

            int sheetCount = xBook.Sheets.Count;
            // Excel collections and cell coordinates are 1-based.
            for (int k = 1; k <= sheetCount; k++)
            {
                Microsoft.Office.Interop.Excel.Worksheet xSheet = (Microsoft.Office.Interop.Excel.Worksheet)xBook.Sheets[k];
                int rcount = xSheet.UsedRange.Rows.Count;
                int ccount = xSheet.UsedRange.Columns.Count;
                for (int m = 1; m <= rcount; m++)
                {
                    for (int n = 1; n <= ccount; n++)
                    {
                        // Typed as object so Append(object) is chosen; an empty
                        // cell (null) then appends nothing.
                        object cellValue = ((Microsoft.Office.Interop.Excel.Range)xSheet.Cells[m, n]).Value2;
                        text.Append(cellValue);
                    }
                }
            }
        }
        finally
        {
            // Always close the workbook and quit Excel, even when reading failed.
            if (xBook != null)
            {
                xBook.Close(missing, missing, missing);
            }
            xApp.Quit();
        }

        return text.ToString();
    }

    /// <summary>
    /// Extracts the text of a PDF file with PDFBox.
    /// </summary>
    /// <param name="path">Full path of the PDF file.</param>
    /// <returns>The text returned by <c>PDFTextStripper.getText</c>.</returns>
    public string ResumePDF(string path)
    {
        org.pdfbox.pdmodel.PDDocument doc = org.pdfbox.pdmodel.PDDocument.load(path);
        try
        {
            org.pdfbox.util.PDFTextStripper pdfStripper = new org.pdfbox.util.PDFTextStripper();
            return pdfStripper.getText(doc);
        }
        finally
        {
            // The loaded document holds resources until it is closed.
            doc.close();
        }
    }

    /// <summary>
    /// Reads several files and returns their texts concatenated.
    /// </summary>
    /// <param name="ResourceRoute">Prefix prepended to every relative path.</param>
    /// <param name="path">Relative file paths separated by ';'; empty entries are skipped.</param>
    /// <returns>
    /// The concatenated text. If any file fails, the error is logged, reading
    /// stops, and the text gathered so far is returned.
    /// </returns>
    public string GetReadContext(string ResourceRoute, string path)
    {
        StringBuilder sb = new StringBuilder();
        try
        {
            if (!string.IsNullOrEmpty(path))
            {
                string[] paths = path.Split(';');
                for (int i = 0; i < paths.Length; i++)
                {
                    if (!string.IsNullOrEmpty(paths[i]))
                    {
                        sb.Append(ReadBySuffix(ResourceRoute, paths[i]));
                    }
                }
            }
        }
        catch (Exception ex)
        {
            LogHelper.writeErrLog( "【读取文件出错:文件名称:" + path + " 】 错误消息:" + ex.Message.ToString());
        }
        return sb.ToString();
    }

    /// <summary>
    /// Reads a single file and returns its text.
    /// </summary>
    /// <param name="ResourceRoute">Prefix prepended to the relative path.</param>
    /// <param name="lpath">Relative path of the file.</param>
    /// <returns>
    /// The file text; an empty string when the path is empty, the extension is
    /// not supported, or reading failed (the failure is logged).
    /// </returns>
    public string GetReadContextSingle(string ResourceRoute, string lpath)
    {
        StringBuilder sb = new StringBuilder();
        try
        {
            if (!string.IsNullOrEmpty(lpath))
            {
                sb.Append(ReadBySuffix(ResourceRoute, lpath));
            }
        }
        catch (Exception ex)
        {
            LogHelper.writeErrLog("【读取文件出错:文件名称:" + ResourceRoute + lpath + " 】 错误消息:" + ex.Message.ToString());
        }
        return sb.ToString();
    }

    /// <summary>
    /// Picks the reader that matches the file extension and returns the file's
    /// text, or an empty string for an unsupported extension. Exceptions from
    /// the reader propagate to the caller.
    /// </summary>
    private string ReadBySuffix(string ResourceRoute, string lpath)
    {
        // Everything after the last '.'; when there is no '.', LastIndexOf
        // returns -1 and the whole string is used.
        string suffix = lpath.Substring(lpath.LastIndexOf('.') + 1);
        string fullPath = ResourceRoute + lpath;

        if (HasSuffix(suffix, "doc") || HasSuffix(suffix, "docx"))
        {
            return ResumeWord(fullPath);
        }
        // The previous code compared "xlsx" with the whole path instead of the
        // suffix, so .xlsx files were never read.
        if (HasSuffix(suffix, "xls") || HasSuffix(suffix, "xlsx"))
        {
            return ResumeExcel(fullPath);
        }
        if (HasSuffix(suffix, "pdf"))
        {
            return ResumePDF(fullPath);
        }
        if (HasSuffix(suffix, "txt"))
        {
            return ResumeTxt(fullPath);
        }
        return string.Empty;
    }

    /// <summary>
    /// Case-insensitive extension comparison, so "REPORT.PDF" is treated like "report.pdf".
    /// </summary>
    private static bool HasSuffix(string suffix, string expected)
    {
        return string.Equals(suffix, expected, StringComparison.OrdinalIgnoreCase);
    }
}
#endregion
#region 创建索引
/// <summary>
/// Maintains the Lucene.Net index of resources: adds, replaces and deletes
/// index documents for a <see cref="ResourcesModel"/> and marks the processed
/// rows as indexed in the database.
/// </summary>
public class CreateResIndex
{
    /// <summary>Directory of the index files (appSetting "pathIndex").</summary>
    public static string IndexPath = ConfigurationManager.AppSettings["pathIndex"];

    /// <summary>Prefix prepended to resource file paths (appSetting "ResourceRoute").</summary>
    public static string ResourceRoute = ConfigurationManager.AppSettings["ResourceRoute"];

    #region Properties
    /// <summary>
    /// A new PanGu analyzer instance on every access.
    /// </summary>
    protected Analyzer NewPanGuAnalyzer
    {
        get { return new PanGuAnalyzer(); }
    }

    /// <summary>
    /// Opens the Lucene.Net directory at <see cref="IndexPath"/> with a native
    /// file-system lock factory. Every access opens a new directory object.
    /// </summary>
    public FSDirectory DirectoryLuce
    {
        get
        {
            return FSDirectory.Open(new DirectoryInfo(IndexPath), new NativeFSLockFactory());
        }
    }
    #endregion

    #region Index creation
    /// <summary>
    /// Applies one resource to the index.
    /// OP == "0": the documents with that ID are deleted.
    /// Otherwise, if the resource's type id is in the list returned by
    /// <see cref="GetResourceTypePublicResources"/>, the old documents are
    /// deleted and new ones are added.
    /// The index is then optimized, the writer closed, and the IDs that were
    /// handled successfully are flagged IsIndex=1 in the database.
    /// </summary>
    /// <param name="res">Resource to apply; null only optimizes the index.</param>
    /// <remarks>
    /// NOTE(review): a resource whose type is not in the list is neither
    /// indexed nor flagged, so the IsIndex=0 query will return it again.
    /// </remarks>
    public void CreateIndex(ResourcesModel res)
    {
        if (!System.IO.Directory.Exists(IndexPath))
        {
            System.IO.Directory.CreateDirectory(IndexPath);
        }

        // Open the directory once; the DirectoryLuce getter opens a new
        // directory object on every access.
        FSDirectory directory = DirectoryLuce;
        bool isUpdate = IndexReader.IndexExists(directory);
        if (isUpdate && IndexWriter.IsLocked(directory))
        {
            // Forcibly clears a lock left by an earlier writer. Only safe when
            // no other writer is active on this index.
            IndexWriter.Unlock(directory);
        }

        List<string> modifyindex = new List<string>();
        IndexWriter writer = new IndexWriter(directory, NewPanGuAnalyzer, !isUpdate, IndexWriter.MaxFieldLength.UNLIMITED);
        try
        {
            if (res != null)
            {
                string id = res.ID.Trim();

                // NOTE(review): the literal compared with OP was stripped from
                // the source (it reads res.OP == ""); "0" is restored from the
                // ResourcesModel.OP comment (delete = 0). Confirm.
                if (res.OP == "0")
                {
                    writer.DeleteDocuments(new Term("ID", id));
                    modifyindex.Add(id);
                    LogHelper.writeLog("【删除索引编号】 【ID:" + id + "】");
                }
                else if (IsPublicResources(GetResourceTypePublicResources(), (res.TypeId ?? "").Trim()))
                {
                    // Delete first so an update never leaves stale documents.
                    writer.DeleteDocuments(new Term("ID", id));
                    if (AddResourceDocuments(writer, res, id))
                    {
                        modifyindex.Add(id);
                    }
                }
            }

            writer.Optimize();
        }
        finally
        {
            // Close even on failure so the index write lock is released.
            writer.Close();
        }

        ModifyResIndex(modifyindex);
    }

    /// <summary>
    /// Adds one summary document for the resource plus one document per
    /// attached file. Returns true on success; on failure the error is logged
    /// and false is returned.
    /// </summary>
    private bool AddResourceDocuments(IndexWriter writer, ResourcesModel res, string id)
    {
        string path = res.FilePath ?? "";
        string title = res.Title ?? "";
        string createdBy = res.CreaetBy ?? "";
        string uploadTime = res.UploadTime ?? "";
        string types = res.Types ?? "";
        string simpleDesc = res.SimpleDesc ?? "";
        string tags = res.Tags ?? "";
        string fileName = res.FileName ?? "";

        try
        {
            AddIndex(writer, id, title, tags, simpleDesc, "", types, uploadTime, createdBy, fileName);

            if (path != "")
            {
                string[] paths = path.Split(';');
                string[] names = fileName.Split(';');
                for (int i = 0; i < paths.Length; i++)
                {
                    string lpath = paths[i];
                    if (string.IsNullOrEmpty(lpath))
                    {
                        continue;
                    }

                    // FileName may have fewer segments than FilePath; fall back
                    // to a name derived from the path instead of indexing past
                    // the end of the array.
                    string lname = i < names.Length ? names[i] : "";
                    string fileText = new ReadFilesTxt().GetReadContextSingle(ResourceRoute, lpath);
                    string newFileName = GetFileName(lpath, lname);

                    // As in the original: for a file document the extracted
                    // text goes into the SimpleDesc argument and the file path
                    // into the ContextDesc argument.
                    AddIndex(writer, id, newFileName, tags, fileText, lpath, types, uploadTime, createdBy, fileName);
                }
            }

            LogHelper.writeLog("【添加索引编号】 【ID:" + id + "】");
            return true;
        }
        catch (Exception ex)
        {
            LogHelper.writeLog("【添加索引失败】 【ID:" + id + "】:" + ex.ToString());
            return false;
        }
    }

    /// <summary>
    /// Builds one index document from the given values and adds it to the writer.
    /// ID and UploadTime are stored and indexed without analysis; Title, Tags,
    /// SimpleDesc, FileName, ContextDesc and Types are stored and analyzed;
    /// CreaetBy is stored only. Null values are indexed as empty strings.
    /// Exceptions from the writer propagate unchanged.
    /// </summary>
    public void AddIndex(IndexWriter writer, string ID, string Title, string Tags, string SimpleDesc, string ContextDesc, string Types, string UploadTime, string CreaetBy,string FileName)
    {
        // The former catch blocks only did "throw ex;", which added nothing and
        // reset the stack trace; exceptions now propagate as they are.
        Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document();
        doc.Add(StoredField("ID", ID, Lucene.Net.Documents.Field.Index.NOT_ANALYZED));
        doc.Add(StoredField("Title", Title, Lucene.Net.Documents.Field.Index.ANALYZED));
        doc.Add(StoredField("Tags", Tags, Lucene.Net.Documents.Field.Index.ANALYZED));
        doc.Add(StoredField("SimpleDesc", SimpleDesc, Lucene.Net.Documents.Field.Index.ANALYZED));
        doc.Add(StoredField("FileName", FileName, Lucene.Net.Documents.Field.Index.ANALYZED));
        doc.Add(StoredField("ContextDesc", ContextDesc, Lucene.Net.Documents.Field.Index.ANALYZED));
        doc.Add(StoredField("Types", Types, Lucene.Net.Documents.Field.Index.ANALYZED));
        doc.Add(StoredField("UploadTime", UploadTime, Lucene.Net.Documents.Field.Index.NOT_ANALYZED));
        doc.Add(StoredField("CreaetBy", CreaetBy, Lucene.Net.Documents.Field.Index.NO));
        writer.AddDocument(doc);
    }

    /// <summary>
    /// Creates a stored field; a null value is replaced by an empty string.
    /// </summary>
    private static Lucene.Net.Documents.Field StoredField(string name, string value, Lucene.Net.Documents.Field.Index index)
    {
        return new Lucene.Net.Documents.Field(name, value ?? "", Lucene.Net.Documents.Field.Store.YES, index);
    }

    /// <summary>
    /// Returns <paramref name="FileName"/> when it is non-empty; otherwise the
    /// part of <paramref name="objfilepath"/> after its last ',' with every ';'
    /// removed; otherwise an empty string.
    /// </summary>
    public string GetFileName(object objfilepath, object FileName)
    {
        if (FileName != null && FileName.ToString() != "")
        {
            return FileName.ToString();
        }

        if (objfilepath != null && objfilepath.ToString() != "")
        {
            string filePath = objfilepath.ToString();
            // NOTE(review): the separator really is ',' in the source, not a
            // path separator - confirm this is intended. When no ',' is
            // present the whole string is used.
            return filePath.Substring(filePath.LastIndexOf(',') + 1).Replace(";", "");
        }

        return "";
    }
    #endregion

    #region Database access
    /// <summary>
    /// Queries the Res_View_createResIndex view.
    /// </summary>
    /// <param name="whereStr">
    /// SQL fragment appended after "where 1=1". It is concatenated as-is, so it
    /// must only ever come from trusted code, never from user input.
    /// </param>
    /// <returns>
    /// The first result table; an empty table when the query returns nothing
    /// or fails (the failure is logged).
    /// </returns>
    public DataTable Get_View_CreateResIndex(string whereStr)
    {
        string sql = " Select * From Res_View_createResIndex where 1=1 " + whereStr;
        DataTable dt = new DataTable();
        try
        {
            DataSet ds = Ruihua.Common.DbHelperSQL.Query(sql);
            if (ds != null && ds.Tables != null && ds.Tables.Count > 0)
            {
                dt = ds.Tables[0];
            }
        }
        catch (Exception ex)
        {
            LogHelper.writeLog("【 获取中心资料库数据错误】:" + ex.ToString());
        }
        return dt;
    }

    /// <summary>
    /// Sets IsIndex=1 on the ResourceInfoNew rows whose id is in the list.
    /// Does nothing for a null or empty list.
    /// </summary>
    /// <param name="list">IDs of the resources that were processed.</param>
    public void ModifyResIndex(List<string> list)
    {
        if (list == null)
        {
            return;
        }

        LogHelper.writeLog("【更新索引编号开始】:" + string.Join(",", list.ToArray()));
        if (list.Count == 0)
        {
            return;
        }

        // '-1' keeps the IN (...) list syntactically valid.
        StringBuilder sb = new StringBuilder("'-1'");
        for (int i = 0; i < list.Count; i++)
        {
            // Double any single quote so an ID can never terminate the SQL literal.
            sb.Append(",'").Append(list[i].Replace("'", "''")).Append("'");
        }

        string sql = string.Format(" update ResourceInfoNew set IsIndex=1 where id in ({0}) ", sb.ToString());
        int result = Ruihua.Common.DbHelperSQL.ExecuteSql(sql);
        LogHelper.writeLog("【更新索引编号结束:" + result.ToString() + "】:" + string.Join(",", list.ToArray()));
    }

    /// <summary>
    /// Tells whether the given type id is in the list.
    /// </summary>
    /// <returns>True when <paramref name="list"/> contains <paramref name="Id"/>.</returns>
    public bool IsPublicResources(List<string> list, string Id)
    {
        return list.Contains(Id);
    }

    /// <summary>
    /// Returns the TIDs of every ResourceType row whose ParentID is empty,
    /// together with the TIDs of all their descendants. The list is kept in
    /// the default memory cache under the key "ResourceType" for 1800 seconds.
    /// </summary>
    public List<string> GetResourceTypePublicResources()
    {
        ObjectCache cache = MemoryCache.Default;
        List<string> cached = cache["ResourceType"] as List<string>;
        if (cached != null)
        {
            return cached;
        }

        List<string> publicresource = new List<string>();
        string sql = "select *From ResourceType ";
        DataSet ds = Ruihua.Common.DbHelperSQL.Query(sql);
        if (ds != null && ds.Tables != null && ds.Tables.Count > 0)
        {
            DataTable dt = ds.Tables[0];
            // NOTE(review): digit-only literals were stripped from this source,
            // so the root marker may originally have been "0" rather than "" -
            // confirm against the ResourceType data.
            var roots = dt.AsEnumerable().Where(row => row.Field<string>("ParentID") == "");
            foreach (var item in roots)
            {
                string tid = item.Field<string>("TID").Trim();
                if (publicresource.Contains(tid))
                {
                    continue;
                }
                publicresource.Add(tid);
                // Collect the descendants of this root.
                AddListString(ref publicresource, dt, tid);
            }
        }

        CacheItemPolicy policy = new CacheItemPolicy();
        policy.AbsoluteExpiration = DateTimeOffset.Now.AddSeconds(1800.0); // evicted after 60*30 seconds
        cache.Set("ResourceType", publicresource, policy);
        return publicresource;
    }

    /// <summary>
    /// Recursively appends to <paramref name="list"/> the TID of every row
    /// whose ParentID equals <paramref name="Id"/>, followed by that row's own
    /// descendants. A TID already in the list is skipped, so a cycle in the
    /// ParentID data cannot recurse without end.
    /// </summary>
    /// <param name="list">List that receives the TIDs.</param>
    /// <param name="dt">ResourceType rows.</param>
    /// <param name="Id">TID of the parent whose children are collected.</param>
    public void AddListString(ref List<string> list, DataTable dt, string Id)
    {
        var children = dt.AsEnumerable().Where(row => row.Field<string>("ParentID") == Id);
        foreach (var item in children)
        {
            string tid = item.Field<string>("TID").Trim();
            if (list.Contains(tid))
            {
                continue;
            }
            list.Add(tid);
            AddListString(ref list, dt, tid);
        }
    }
    #endregion
}
#endregion
LuceneNet 搜索一的更多相关文章
- sulin LuceneNet 搜索二
1.添加引用dll using Lucene.Net.Search;using Lucene.Net.Analysis.PanGu;using PanGu;using PanGu.HighLight; ...
- Lucene.Net 入门级实例 浅显易懂。。。
Lucene.Net 阅读目录 开始 Lucene简介 效果图 Demo文件说明 简单使用 重点类的说明 存在问题 调整后 Lucene.Net博文与资源下载 做过站内搜索的朋友应该对Lucene.N ...
- Lucene.Net(转)
出处:http://www.cnblogs.com/piziyimao/archive/2013/01/31/2887072.html 做过站内搜索的朋友应该对Lucene.Net不陌生,没做过的也许 ...
- lucene.net helper类 【结合盘古分词进行搜索的小例子(分页功能)】
转自:http://blog.csdn.net/pukuimin1226/article/details/17558247 添加:2013-12-25 更新:2013-12-26 新增分页功能. ...
- 瞎折腾之 Lucene.Net + MVC 搜索功能(上)
前言 首先,关于Lucene.Net 的文章已经很多了.我这次决定写出来只是为了练练手,虽然在别人看来没什么用,但是自己确实是手动实践了一把.我个人觉得还是有意义的.爱折腾.敢于实践.才能有所收获,才 ...
- Lucene.net 从创建索引到搜索的代码范例
关于Lucene.Net的介绍网上已经很多了在这里就不多介绍Lucene.Net主要分为建立索引,维护索引和搜索索引Field.Store的作用是通过全文检查就能返回对应的内容,而不必再通过id去DB ...
- 记一次企业级爬虫系统升级改造(五):基于JieBaNet+Lucene.Net实现全文搜索
实现效果: 上一篇文章有附全文搜索结果的设计图,下面截一张开发完成上线后的实图: 基本风格是模仿的百度搜索结果,绿色的分页略显小清新. 目前已采集并创建索引的文章约3W多篇,索引文件不算太大,查询速度 ...
- LuceneNet 实现快速大文件大数据查询
做过站内搜索的朋友应该对Lucene.Net不陌生,因为用普通的sql like查询肯定是不行的,太慢了. 首先说明的是--Lucene.Net只是一个全文检索开发包,不是一个成型的搜索引擎, 它的 ...
- Lucene.net(4.8.0) 学习问题记录六:Lucene 的索引系统和搜索过程分析
前言:目前自己在做使用Lucene.net和PanGu分词实现全文检索的工作,不过自己是把别人做好的项目进行迁移.因为项目整体要迁移到ASP.NET Core 2.0版本,而Lucene使用的版本是3 ...
随机推荐
- wdatapicker 时间选择器——例
效果: html: <!DOCTYPE html> <html> <head> <meta charset="utf-8"> < ...
- java ajax长连接请求服务器数据
Servlet 3.0笔记之异步请求Comet推送长轮询(long polling)篇 Comet另一种形式为长轮询(long polling),客户端会与服务器建立一个持久的连接,直到服务器端有数据 ...
- linux常用软连接使用ln -s
[软连接]另外一种连接称之为符号连接(Symbolic Link),也叫软连接.软链接文件有类似于Windows的快捷方式.它实际上是一个特殊的文件.在符号连接中,文件实际上是一个文本文件,其中包含的 ...
- 模拟实现call、apply
1. 知识点补充: 首先在模拟实现前,先Mark一些我之前不知道的知识: a. eval(string)函数:可计算某个字符串,并执行其中的JavaScript代码 其中,string是必需传入的待计 ...
- hdu5952 Counting Cliques 技巧dfs
题意:一个有N个点M条边的图,球其中由S个点构成的团的个数.一个团是一个完全子图. 没有什么好办法,只有暴力深搜,但是这里有一个神奇的操作:将无向图转为有向图:当两个点编号u<v时才有边u-&g ...
- win10下aria2和BaiduExporter的配置和安装
一.aria2的配置 下载 aria2下载地址: https://github.com/aria2/aria2/releases 链接:https://pan.baidu.com/s/1olJyZkX ...
- 数据结构学习笔记_树(二叉搜索树,B-树,B+树,B*树)
一.查找二叉树(二叉搜索树BST) 1.查找二叉树的性质 1).所有非叶子结点至多拥有两个儿子(Left和Right): 2).所有结点存储一个关键字: 3).非叶子结点的左指针指向小于其关键字的子树 ...
- centos7 搭建 php7 + nginx (2)
安装php php下载地址 # 避免出错,先安装下面 yum install libzip libzip-devel libxml2-devel openssl openssl-devel bzip2 ...
- 关于H5裁剪图片后,直传阿里云的一些问题
这段时间在工作中碰到一个需要在h5裁剪图像,然后直传阿里云的需求.图中遇到了一些小问题,分享出来大家都看看. h5裁剪图像:cropper.js是一个神器啊关于用法,网上可以收罗出大量的帖子,这里我就 ...
- 第12章 SQL联接
第12章 SQL联接 关系数据库的3个支柱:选择.投影和联接. 两种基本的连接同等联接和非同等联接. 源表和目标表有相同的名称的列,就可以在他们之间执行自然联接,而无需指定连接列. 自然join us ...