LuceneNet 搜索一
1.引用读取PDF文件组件
FontBox-0.1.0-dev.dll
IKVM.GNU.Classpath.dll
IKVM.Runtime.dll
PDFBox-0.7.3.dll
2.添加 Office 组件(Microsoft.Office.Interop.Word / Microsoft.Office.Interop.Excel,用于读取 Word、Excel 文件),添加方法此处略过
3.添加盘古分词
PanGu.dll
PanGu.HighLight.dll
PanGu.Lucene.Analyzer.dll
4.添加Lucene.net 引用
Lucene.Net.dll
5.创建索引库
#region 同步资料到索引库
// Queue of resources waiting to be indexed; filled by InitTaskQueue and drained by ThreadInvoke.
static Queue<ResourcesModel> TaskQueue = new Queue<ResourcesModel>();

/// <summary>
/// Event handler that reloads pending resources into <see cref="TaskQueue"/> and logs the outcome.
/// Any exception raised while loading (or while writing the success log) is logged and swallowed.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void tmResSync_Tick(object sender, EventArgs e)
{
    try
    {
        // Read pending rows into the queue.
        InitTaskQueue();

        string successText = "WinFrom【同步数据索引库读取数据到队列】成功!!";
        LogHelper.writeLog(successText);
    }
    catch (Exception loadError)
    {
        string failureText = "WinFrom【同步数据索引库读取数据到队列】:" + loadError.Message;
        LogHelper.writeErrLog(failureText);
    }
}
/// <summary>
/// Starts <see cref="ThreadInvoke"/> on a new background thread.
/// </summary>
public void ServiceStart()
{
    ThreadStart entryPoint = new ThreadStart(ThreadInvoke);
    Thread worker = new Thread(entryPoint) { IsBackground = true };
    worker.Start();
}
/// <summary>
/// Endless worker loop: takes one resource at a time from <see cref="TaskQueue"/> and passes it to
/// <c>CreateResIndex.CreateIndex</c>. When the queue is empty the thread sleeps before polling again.
/// Exceptions are logged and the loop continues with the next item.
/// </summary>
public void ThreadInvoke()
{
    // NOTE(review): the original sleep argument was lost from the source text;
    // 1000 ms is an assumed polling interval — confirm the intended value.
    const int idleDelayMilliseconds = 1000;

    while (true)
    {
        try
        {
            ResourcesModel res = null;
            bool hasWork = false;

            // Check and dequeue under the same lock so the emptiness test cannot race with
            // another thread touching the queue (Queue<T> is not thread-safe).
            lock (TaskQueue)
            {
                if (TaskQueue.Count > 0)
                {
                    res = TaskQueue.Dequeue();
                    hasWork = true;
                }
            }

            if (hasWork)
            {
                // Index outside the lock so producers are not blocked while indexing runs.
                new CreateResIndex().CreateIndex(res);
            }
            else
            {
                Thread.Sleep(idleDelayMilliseconds);
            }
        }
        catch (Exception ex)
        {
            LogHelper.writeErrLog("WinFrom【同步数据索引库出错】:" + ex.ToString());
        }
    }
}
/// <summary>
/// Loads every row of <c>Res_View_createResIndex</c> that has an upload time and is not yet indexed
/// (<c>IsIndex=0</c>), converts each row to a <see cref="ResourcesModel"/> and enqueues it on
/// <see cref="TaskQueue"/>.
/// </summary>
public void InitTaskQueue()
{
    // Read the pending rows from the resource centre view.
    var query = new CreateResIndex().Get_View_CreateResIndex(" and uploadTime is not null and IsIndex=0 ");
    if (query == null)
    {
        return;
    }

    for (int i = 0; i < query.Rows.Count; i++)
    {
        var row = query.Rows[i];
        var model = new ResourcesModel();

        // Convert.ToString yields "" for both null and DBNull, which is what the column
        // fallbacks are meant to produce (a DataRow cell is never a null reference).
        model.ID = Convert.ToString(row["ID"]);
        model.FileName = Convert.ToString(row["FileName"]);
        model.FilePath = Convert.ToString(row["FilePath"]);
        model.CreaetBy = Convert.ToString(row["UserName"]);
        model.Types = Convert.ToString(row["Name"]);
        model.TypeId = Convert.ToString(row["Type"]);
        model.SimpleDesc = Convert.ToString(row["SimpleDesc"]);
        model.Title = Convert.ToString(row["Title"]);
        model.Tags = Convert.ToString(row["Tag"]);

        // ResourcesModel.OP is documented as delete=0, add=1, modify=2.
        // NOTE(review): the three OP literals were lost from the source text. "0" for deleted rows
        // follows from that documentation; "1" for every other row is an assumption — confirm.
        string isDelText = Convert.ToString(row["IsDel"]);
        bool isDeleted = isDelText != "" && Convert.ToBoolean(isDelText);
        model.OP = isDeleted ? "0" : "1";

        // Invariant culture keeps the stored date text stable regardless of the machine's locale.
        string uploadText = Convert.ToString(row["uploadTime"]);
        model.UploadTime = uploadText != ""
            ? Convert.ToDateTime(row["uploadTime"]).ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture)
            : "";

        // The worker thread dequeues under lock (TaskQueue); enqueue under the same lock
        // because Queue<T> is not safe for concurrent readers and writers.
        lock (TaskQueue)
        {
            TaskQueue.Enqueue(model);
        }
    }
}
#endregion
#region ResourcesModel
/// <summary>
/// Plain data holder describing one resource that is to be added to, updated in,
/// or removed from the search index. All values are carried as strings.
/// </summary>
public class ResourcesModel
{
    public ResourcesModel()
    {
    }

    /// <summary>
    /// Identifier.
    /// </summary>
    public string ID { get; set; }

    /// <summary>
    /// Title.
    /// </summary>
    public string Title { get; set; }

    /// <summary>
    /// Tags.
    /// </summary>
    public string Tags { get; set; }

    /// <summary>
    /// Creator.
    /// </summary>
    public string CreaetBy { get; set; }

    /// <summary>
    /// Upload time.
    /// </summary>
    public string UploadTime { get; set; }

    /// <summary>
    /// Category.
    /// </summary>
    public string Types { get; set; }

    /// <summary>
    /// Short description.
    /// </summary>
    public string SimpleDesc { get; set; }

    /// <summary>
    /// Content.
    /// </summary>
    public string ContextDesc { get; set; }

    /// <summary>
    /// Operation marker: delete=0, add=1, modify=2.
    /// </summary>
    public string OP { get; set; }

    /// <summary>
    /// Type id.
    /// </summary>
    public string TypeId { get; set; }

    /// <summary>
    /// File path.
    /// </summary>
    public string FilePath { get; set; }

    /// <summary>
    /// File name.
    /// </summary>
    public string FileName { get; set; }
}
#endregion
#region 读取文件
/// <summary>
/// Extracts plain text from resource files so their content can be indexed.
/// The file kind is chosen from the file name suffix: doc/docx (Word interop), xls/xlsx (Excel interop),
/// pdf (PDFBox) and txt. Files with any other suffix contribute no text.
/// </summary>
public class ReadFilesTxt
{
    /// <summary>
    /// Reads a text file using the system default encoding. If the decoded text contains
    /// "GB2312" (case-insensitive) the file is read again as GB2312 and that result is returned.
    /// </summary>
    /// <param name="path">Full path of the text file.</param>
    /// <returns>The file content.</returns>
    public string ResumeTxt(string path)
    {
        string text;
        // Dispose the readers so the file handle is released immediately.
        using (StreamReader reader = new StreamReader(path, System.Text.Encoding.Default))
        {
            text = reader.ReadToEnd();
        }

        if (text.IndexOf("GB2312", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            using (StreamReader gbReader = new StreamReader(path, System.Text.Encoding.GetEncoding("GB2312")))
            {
                text = gbReader.ReadToEnd();
            }
        }
        return text;
    }

    /// <summary>
    /// Opens a Word document read-only through Word interop and returns its text content.
    /// The document and the Word application are closed even when reading fails.
    /// </summary>
    /// <param name="path">Full path of the Word document.</param>
    private string ResumeWord(string path)
    {
        object missing = System.Reflection.Missing.Value;
        object readOnly = true;
        object docPath = path;

        Microsoft.Office.Interop.Word.Application wordApp = new Microsoft.Office.Interop.Word.Application();
        try
        {
            Microsoft.Office.Interop.Word.Document wordDoc = wordApp.Documents.Open(ref docPath,
                ref missing,
                ref readOnly,
                ref missing,
                ref missing,
                ref missing,
                ref missing,
                ref missing,
                ref missing,
                ref missing,
                ref missing,
                ref missing,
                ref missing,
                ref missing,
                ref missing,
                ref missing);
            try
            {
                return wordDoc.Content.Text;
            }
            finally
            {
                wordDoc.Close(ref missing, ref missing, ref missing);
            }
        }
        finally
        {
            // Always quit, otherwise a failed read leaves the Word application running.
            wordApp.Quit(ref missing, ref missing, ref missing);
        }
    }

    /// <summary>
    /// Opens an Excel workbook read-only through Excel interop and concatenates the Value2 of every
    /// cell in each sheet's used range (sheet by sheet, row by row, column by column, no separators).
    /// The workbook and the Excel application are closed even when reading fails.
    /// </summary>
    /// <param name="path">Full path of the workbook.</param>
    private string ResumeExcel(string path)
    {
        StringBuilder content = new StringBuilder();
        object readOnly = true;
        object missing = System.Reflection.Missing.Value;

        Microsoft.Office.Interop.Excel.Application xApp = new Microsoft.Office.Interop.Excel.Application();
        try
        {
            xApp.Visible = false;
            Microsoft.Office.Interop.Excel.Workbook xBook = xApp.Workbooks._Open(path,
                missing, readOnly, missing, missing,
                missing, missing, missing, missing,
                missing, missing, missing, missing);
            try
            {
                int sheetCount = xBook.Sheets.Count;
                // Sheet and cell indexes are 1-based, hence the loops run from 1 to Count inclusive.
                for (int sheetIndex = 1; sheetIndex <= sheetCount; sheetIndex++)
                {
                    Microsoft.Office.Interop.Excel.Worksheet xSheet =
                        (Microsoft.Office.Interop.Excel.Worksheet)xBook.Sheets[sheetIndex];
                    int rowCount = xSheet.UsedRange.Rows.Count;
                    int columnCount = xSheet.UsedRange.Columns.Count;

                    for (int row = 1; row <= rowCount; row++)
                    {
                        for (int column = 1; column <= columnCount; column++)
                        {
                            object cellValue = ((Microsoft.Office.Interop.Excel.Range)xSheet.Cells[row, column]).Value2;
                            // Append(object) adds nothing for null, same as string concatenation did.
                            content.Append(cellValue);
                        }
                    }
                }
            }
            finally
            {
                xBook.Close(missing, missing, missing);
            }
        }
        finally
        {
            xApp.Quit();
        }
        return content.ToString();
    }

    /// <summary>
    /// Extracts the text of a PDF file with PDFBox and closes the document afterwards.
    /// </summary>
    /// <param name="path">Full path of the PDF file.</param>
    public string ResumePDF(string path)
    {
        org.pdfbox.pdmodel.PDDocument doc = org.pdfbox.pdmodel.PDDocument.load(path);
        try
        {
            org.pdfbox.util.PDFTextStripper pdfStripper = new org.pdfbox.util.PDFTextStripper();
            return pdfStripper.getText(doc);
        }
        finally
        {
            doc.close();
        }
    }

    /// <summary>
    /// Reads several files and returns their extracted text concatenated in order.
    /// If any file fails, the error is logged and the text gathered so far is returned.
    /// </summary>
    /// <param name="ResourceRoute">Prefix prepended to every relative path.</param>
    /// <param name="path">Relative file paths separated by ';'. Empty entries are skipped.</param>
    public string GetReadContext(string ResourceRoute, string path)
    {
        StringBuilder sb = new StringBuilder();
        try
        {
            if (!string.IsNullOrEmpty(path))
            {
                string[] paths = path.Split(';');
                for (int i = 0; i < paths.Length; i++)
                {
                    if (!string.IsNullOrEmpty(paths[i]))
                    {
                        sb.Append(ReadBySuffix(ResourceRoute, paths[i]));
                    }
                }
            }
        }
        catch (Exception ex)
        {
            LogHelper.writeErrLog("【读取文件出错:文件名称:" + path + " 】 错误消息:" + ex.Message.ToString());
        }
        return sb.ToString();
    }

    /// <summary>
    /// Reads a single file and returns its extracted text. Errors are logged and "" is returned.
    /// </summary>
    /// <param name="ResourceRoute">Prefix prepended to the relative path.</param>
    /// <param name="lpath">Relative path of the file.</param>
    public string GetReadContextSingle(string ResourceRoute, string lpath)
    {
        StringBuilder sb = new StringBuilder();
        try
        {
            if (!string.IsNullOrEmpty(lpath))
            {
                sb.Append(ReadBySuffix(ResourceRoute, lpath));
            }
        }
        catch (Exception ex)
        {
            LogHelper.writeErrLog("【读取文件出错:文件名称:" + ResourceRoute + lpath + " 】 错误消息:" + ex.Message.ToString());
        }
        return sb.ToString();
    }

    // Picks the reader matching the file suffix; returns "" for unsupported suffixes. Exceptions propagate to the caller.
    private string ReadBySuffix(string resourceRoute, string lpath)
    {
        // Text after the last '.'; when there is no dot the whole string is used and matches nothing.
        string suffix = lpath.Substring(lpath.LastIndexOf('.') + 1);
        string fullPath = resourceRoute + lpath;

        if (HasSuffix(suffix, "doc") || HasSuffix(suffix, "docx"))
        {
            return ResumeWord(fullPath);
        }
        // The original compared "xlsx" with the whole path, so .xlsx files were never read.
        if (HasSuffix(suffix, "xls") || HasSuffix(suffix, "xlsx"))
        {
            return ResumeExcel(fullPath);
        }
        if (HasSuffix(suffix, "pdf"))
        {
            return ResumePDF(fullPath);
        }
        if (HasSuffix(suffix, "txt"))
        {
            return ResumeTxt(fullPath);
        }
        return string.Empty;
    }

    // Case-insensitive suffix comparison so that e.g. ".PDF" is handled like ".pdf".
    private static bool HasSuffix(string suffix, string expected)
    {
        return string.Equals(suffix, expected, StringComparison.OrdinalIgnoreCase);
    }
}
#endregion
#region 创建索引
/// <summary>
/// Maintains the Lucene.Net index for resources: adds, replaces or deletes the documents of one
/// resource, then marks the processed ids as indexed in the database.
/// </summary>
public class CreateResIndex
{
    public static string IndexPath = ConfigurationManager.AppSettings["pathIndex"]; // index directory
    public static string ResourceRoute = ConfigurationManager.AppSettings["ResourceRoute"]; // root prefix of resource files

    // ResourcesModel.OP is documented as delete=0, add=1, modify=2.
    // NOTE(review): the literal compared with OP was lost from the source text; "0" follows that documentation.
    private const string OpDelete = "0";

    // NOTE(review): the source compares ParentID with "" to find top-level types; the literal may have
    // been lost in extraction (e.g. "0") — confirm against the ResourceType table.
    private const string RootParentId = "";

    #region Properties
    /// <summary>
    /// A new PanGu analyzer instance.
    /// </summary>
    protected Analyzer NewPanGuAnalyzer
    {
        get { return new PanGuAnalyzer(); }
    }

    /// <summary>
    /// Opens the Lucene.Net directory at <see cref="IndexPath"/>. Every read opens a new directory object.
    /// </summary>
    public FSDirectory DirectoryLuce
    {
        get
        {
            return FSDirectory.Open(new DirectoryInfo(IndexPath), new NativeFSLockFactory());
        }
    }
    #endregion

    #region Index creation
    /// <summary>
    /// Applies one resource to the index. A resource whose OP is the delete marker has its documents
    /// removed; any other resource is (re)indexed, but only if its type id is in the public type list.
    /// The writer is optimized and closed afterwards and the processed ids are flagged in the database.
    /// </summary>
    /// <param name="res">Resource to process; when null only the writer is opened, optimized and closed.</param>
    public void CreateIndex(ResourcesModel res)
    {
        // Create the index directory on first use.
        if (!System.IO.Directory.Exists(IndexPath))
        {
            System.IO.Directory.CreateDirectory(IndexPath);
        }

        // Load the type list before the writer is opened so a database failure cannot leave a writer open.
        List<string> publicTypeIds = GetResourceTypePublicResources();
        List<string> modifyindex = new List<string>();

        // Open the directory once instead of once per call below.
        FSDirectory directory = DirectoryLuce;
        bool isUpdate = IndexReader.IndexExists(directory);
        if (isUpdate && IndexWriter.IsLocked(directory))
        {
            // Forcibly clears an existing write lock before opening the writer.
            IndexWriter.Unlock(directory);
        }

        IndexWriter writer = new IndexWriter(directory, NewPanGuAnalyzer, !isUpdate, IndexWriter.MaxFieldLength.UNLIMITED);
        try
        {
            if (res != null)
            {
                string id = res.ID.Trim();
                if (res.OP == OpDelete)
                {
                    writer.DeleteDocuments(new Term("ID", id));
                    modifyindex.Add(id);
                    LogHelper.writeLog("【删除索引编号】 【ID:" + id + "】");
                }
                else if (IsPublicResources(publicTypeIds, (res.TypeId ?? "").Trim()))
                {
                    IndexResource(writer, res, id, modifyindex);
                }
            }
            writer.Optimize();
        }
        finally
        {
            // Close even on failure so the index write lock is released.
            writer.Close();
        }
        ModifyResIndex(modifyindex);
    }

    // Replaces all index documents of one resource: one document for the resource itself plus one per attached file.
    // On success the id is appended to indexedIds; on failure the error is logged and the id is not appended.
    private void IndexResource(IndexWriter writer, ResourcesModel res, string id, List<string> indexedIds)
    {
        // Remove earlier documents of this resource so re-indexing does not duplicate them.
        writer.DeleteDocuments(new Term("ID", id));

        string path = res.FilePath ?? "";
        string title = res.Title ?? "";
        string createdBy = res.CreaetBy ?? "";
        string uploadTime = res.UploadTime ?? "";
        string types = res.Types ?? "";
        string simpleDesc = res.SimpleDesc ?? "";
        string tags = res.Tags ?? "";
        string fileName = res.FileName ?? "";

        try
        {
            AddIndex(writer, id, title, tags, simpleDesc, "", types, uploadTime, createdBy, fileName);

            if (path != "")
            {
                string[] paths = path.Split(';');
                string[] names = fileName.Split(';');
                for (int i = 0; i < paths.Length; i++)
                {
                    string lpath = paths[i];
                    if (string.IsNullOrEmpty(lpath))
                    {
                        continue;
                    }

                    // There may be fewer names than paths; GetFileName then falls back to the path.
                    string lname = i < names.Length ? names[i] : "";
                    string fileText = new ReadFilesTxt().GetReadContextSingle(ResourceRoute, lpath);
                    string documentTitle = GetFileName(lpath, lname);

                    // As in the original call: the extracted text is passed in the SimpleDesc position
                    // and the file path in the ContextDesc position.
                    AddIndex(writer, id, documentTitle, tags, fileText, lpath, types, uploadTime, createdBy, fileName);
                }
            }

            LogHelper.writeLog("【添加索引编号】 【ID:" + id + "】");
            indexedIds.Add(id);
        }
        catch (Exception ex)
        {
            LogHelper.writeLog("【添加索引失败】 【ID:" + id + "】:" + ex.Message.ToString());
        }
    }

    /// <summary>
    /// Builds one Lucene document from the given values and adds it to the writer.
    /// All fields are stored. ID and UploadTime are indexed without analysis, CreaetBy is not indexed,
    /// and the remaining fields are analyzed. Exceptions propagate to the caller unchanged.
    /// </summary>
    public void AddIndex(IndexWriter writer, string ID, string Title, string Tags, string SimpleDesc, string ContextDesc, string Types, string UploadTime, string CreaetBy, string FileName)
    {
        // The former catch blocks only re-threw with "throw ex;", which discarded the original stack trace.
        Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document();
        doc.Add(StoredField("ID", ID, Lucene.Net.Documents.Field.Index.NOT_ANALYZED));
        doc.Add(StoredField("Title", Title, Lucene.Net.Documents.Field.Index.ANALYZED));
        doc.Add(StoredField("Tags", Tags, Lucene.Net.Documents.Field.Index.ANALYZED));
        doc.Add(StoredField("SimpleDesc", SimpleDesc, Lucene.Net.Documents.Field.Index.ANALYZED));
        doc.Add(StoredField("FileName", FileName, Lucene.Net.Documents.Field.Index.ANALYZED));
        doc.Add(StoredField("ContextDesc", ContextDesc, Lucene.Net.Documents.Field.Index.ANALYZED));
        doc.Add(StoredField("Types", Types, Lucene.Net.Documents.Field.Index.ANALYZED));
        doc.Add(StoredField("UploadTime", UploadTime, Lucene.Net.Documents.Field.Index.NOT_ANALYZED));
        doc.Add(StoredField("CreaetBy", CreaetBy, Lucene.Net.Documents.Field.Index.NO));
        writer.AddDocument(doc);
    }

    // Creates a stored field with the given index mode.
    private static Lucene.Net.Documents.Field StoredField(string name, string value, Lucene.Net.Documents.Field.Index index)
    {
        return new Lucene.Net.Documents.Field(name, value, Lucene.Net.Documents.Field.Store.YES, index);
    }

    /// <summary>
    /// Returns <paramref name="FileName"/> when it is non-empty; otherwise derives a name from
    /// <paramref name="objfilepath"/> (the text after its last ',' with every ';' removed).
    /// Returns "" when both are null or empty.
    /// </summary>
    public string GetFileName(object objfilepath, object FileName)
    {
        if (FileName != null && FileName.ToString() != "")
        {
            return FileName.ToString();
        }

        if (objfilepath != null && objfilepath.ToString() != "")
        {
            string filePath = objfilepath.ToString();
            // NOTE(review): the source splits on ',' here — presumably a path separator was intended; confirm.
            // Without a ',' LastIndexOf returns -1, so the whole path is used.
            return filePath.Substring(filePath.LastIndexOf(',') + 1).Replace(";", "");
        }
        return "";
    }
    #endregion

    #region Database access
    /// <summary>
    /// Queries the view <c>Res_View_createResIndex</c>.
    /// </summary>
    /// <param name="whereStr">SQL appended after "where 1=1"; it is concatenated verbatim, so it must
    /// come from trusted code only.</param>
    /// <returns>The first result table, or an empty table when nothing was returned or the query failed
    /// (the failure is logged).</returns>
    public DataTable Get_View_CreateResIndex(string whereStr)
    {
        string sql = " Select * From Res_View_createResIndex where 1=1 " + whereStr;
        DataTable dt = new DataTable();
        try
        {
            DataSet ds = Ruihua.Common.DbHelperSQL.Query(sql);
            if (ds != null && ds.Tables != null && ds.Tables.Count > 0)
            {
                dt = ds.Tables[0];
            }
        }
        catch (Exception ex)
        {
            LogHelper.writeLog("【 获取中心资料库数据错误】:" + ex.ToString());
        }
        return dt;
    }

    /// <summary>
    /// Sets <c>IsIndex=1</c> on <c>ResourceInfoNew</c> for the given ids. Does nothing except logging
    /// when the list is empty.
    /// </summary>
    /// <param name="list">Ids of the resources that were processed.</param>
    public void ModifyResIndex(List<string> list)
    {
        if (list == null)
        {
            return;
        }

        string sql = " update ResourceInfoNew set IsIndex=1 where id in ({0}) ";
        // '-1' keeps the IN list syntactically valid as a leading element.
        StringBuilder sb = new StringBuilder("'-1'");
        LogHelper.writeLog("【更新索引编号开始】:" + string.Join(",", list.ToArray()));
        if (list.Count > 0)
        {
            for (int i = 0; i < list.Count; i++)
            {
                // Double embedded single quotes so an id cannot terminate the SQL string literal.
                sb.Append(",'" + (list[i] ?? "").Replace("'", "''") + "'");
            }
            sql = string.Format(sql, sb.ToString());
            int result = Ruihua.Common.DbHelperSQL.ExecuteSql(sql);
            LogHelper.writeLog("【更新索引编号结束:" + result.ToString() + "】:" + string.Join(",", list.ToArray()));
        }
    }

    /// <summary>
    /// Tells whether the type id is contained in the public resource type list.
    /// </summary>
    public bool IsPublicResources(List<string> list, string Id)
    {
        return list.Contains(Id);
    }

    /// <summary>
    /// Returns the ids (TID) of the top-level rows of <c>ResourceType</c> and of all their descendants.
    /// The list is kept in <see cref="MemoryCache.Default"/> under the key "ResourceType" for 30 minutes.
    /// </summary>
    public List<string> GetResourceTypePublicResources()
    {
        ObjectCache cache = MemoryCache.Default;
        List<string> cached = cache["ResourceType"] as List<string>;
        if (cached != null)
        {
            return cached;
        }

        List<string> publicresource = new List<string>();
        string sql = "select *From ResourceType ";
        DataSet ds = Ruihua.Common.DbHelperSQL.Query(sql);
        if (ds != null && ds.Tables != null && ds.Tables.Count > 0)
        {
            DataTable dt = ds.Tables[0];
            foreach (DataRow row in dt.Rows)
            {
                if (row.Field<string>("ParentID") != RootParentId)
                {
                    continue;
                }

                string tid = row.Field<string>("TID");
                if (tid == null)
                {
                    continue;
                }

                tid = tid.Trim();
                if (publicresource.Contains(tid))
                {
                    continue;
                }

                publicresource.Add(tid);
                // Descend into the child types.
                AddListString(ref publicresource, dt, tid);
            }

            // Cache only when a table was read, so a missing result is not remembered for 30 minutes.
            CacheItemPolicy policy = new CacheItemPolicy();
            policy.AbsoluteExpiration = DateTimeOffset.Now.AddSeconds(1800.0); // evicted after 60*30 seconds
            cache.Set("ResourceType", publicresource, policy);
        }
        return publicresource;
    }

    /// <summary>
    /// Appends to <paramref name="list"/> the trimmed TID of every row whose ParentID equals
    /// <paramref name="Id"/>, followed recursively by that row's own descendants.
    /// </summary>
    /// <param name="list">List that receives the ids.</param>
    /// <param name="dt">Table with the string columns TID and ParentID.</param>
    /// <param name="Id">Parent id whose children are collected.</param>
    public void AddListString(ref List<string> list, DataTable dt, string Id)
    {
        foreach (DataRow row in dt.Rows)
        {
            if (row.Field<string>("ParentID") != Id)
            {
                continue;
            }

            string tid = row.Field<string>("TID");
            if (tid == null)
            {
                continue;
            }

            tid = tid.Trim();
            // Skip ids already collected: guards against endless recursion on cyclic parent links.
            if (list.Contains(tid))
            {
                continue;
            }

            list.Add(tid);
            AddListString(ref list, dt, tid);
        }
    }
    #endregion
}
#endregion
LuceneNet 搜索一的更多相关文章
- sulin LuceneNet 搜索二
1.添加引用dll using Lucene.Net.Search;using Lucene.Net.Analysis.PanGu;using PanGu;using PanGu.HighLight; ...
- Lucene.Net 入门级实例 浅显易懂。。。
Lucene.Net 阅读目录 开始 Lucene简介 效果图 Demo文件说明 简单使用 重点类的说明 存在问题 调整后 Lucene.Net博文与资源下载 做过站内搜索的朋友应该对Lucene.N ...
- Lucene.Net(转)
出处:http://www.cnblogs.com/piziyimao/archive/2013/01/31/2887072.html 做过站内搜索的朋友应该对Lucene.Net不陌生,没做过的也许 ...
- lucene.net helper类 【结合盘古分词进行搜索的小例子(分页功能)】
转自:http://blog.csdn.net/pukuimin1226/article/details/17558247 添加:2013-12-25 更新:2013-12-26 新增分页功能. ...
- 瞎折腾之 Lucene.Net + MVC 搜索功能(上)
前言 首先,关于Lucene.Net 的文章已经很多了.我这次决定写出来只是为了练练手,虽然在别人看来没什么用,但是自己确实是手动实践了一把.我个人觉得还是有意义的.爱折腾.敢于实践.才能有所收获,才 ...
- Lucene.net 从创建索引到搜索的代码范例
关于Lucene.Net的介绍网上已经很多了在这里就不多介绍Lucene.Net主要分为建立索引,维护索引和搜索索引Field.Store的作用是通过全文检查就能返回对应的内容,而不必再通过id去DB ...
- 记一次企业级爬虫系统升级改造(五):基于JieBaNet+Lucene.Net实现全文搜索
实现效果: 上一篇文章有附全文搜索结果的设计图,下面截一张开发完成上线后的实图: 基本风格是模仿的百度搜索结果,绿色的分页略显小清新. 目前已采集并创建索引的文章约3W多篇,索引文件不算太大,查询速度 ...
- LuceneNet 实现快速大文件大数据查询
做过站内搜索的朋友应该对Lucene.Net不陌生,因为用普通的sql like查询肯定是不行的,太慢了. 首先说明的是--Lucene.Net只是一个全文检索开发包,不是一个成型的搜索引擎, 它的 ...
- Lucene.net(4.8.0) 学习问题记录六:Lucene 的索引系统和搜索过程分析
前言:目前自己在做使用Lucene.net和PanGu分词实现全文检索的工作,不过自己是把别人做好的项目进行迁移.因为项目整体要迁移到ASP.NET Core 2.0版本,而Lucene使用的版本是3 ...
随机推荐
- 在使用python语言的open函数时,提示错误OSError: [Errno 22] Invalid argument: ‘文件路径’
如题,在使用python语言的open函数时,提示错误OSError: [Errno 22] Invalid argument: '文件路径',在查阅了大量资料后也得到了一些解决方案,但是这些解决方案 ...
- Educational Codeforces Round 69 D E
Educational Codeforces Round 69 题解 题目编号 A B C D E F 完成情况 √ √ √ ★ ★ - D. Yet Another Subarray Problem ...
- 洛谷P3239 [HNOI2015]亚瑟王
题目描述 小 K 不慎被 LL 邪教洗脑了,洗脑程度深到他甚至想要从亚瑟王邪教中脱坑.他决定,在脱坑之前,最后再来打一盘亚瑟王.既然是最后一战,就一定要打得漂亮.众所周知,亚瑟王是一个看脸的游戏,技能 ...
- Luogu P1131 [ZJOI2007]时态同步(dfs)
P1131 [ZJOI2007]时态同步 题意 题目描述 小\(Q\)在电子工艺实习课上学习焊接电路板.一块电路板由若干个元件组成,我们不妨称之为节点,并将其用数字\(1,2,3,\dots\).进行 ...
- 黑阀 adb 命令
adb 命令 adb -d shell sh /data/data/me.piebridge.brevent/brevent.sh
- vue element传的值报_self.$scopedSlots.default is not a function
问题描述:使用表格时做了v-if判断:首次渲染没有问题:反复操作便会报错: 解决办法:el-table上给v-if的 el-table-colunm 加上:key="Math.random( ...
- leetcode146周赛-1131-绝对值表达式的最大值
题目描述: class Solution: def maxAbsValExpr(self, arr1, arr2) -> int: def function(s1,s2): result1=[] ...
- Django之深入了解视图层
目录 视图层三板斧 HttpResponse render redirect JsonResponse FBV CBV CBV源码 如何给FBV和CBV加装饰器 视图层三板斧 规定视图函数必须有一个返 ...
- BZOJ2594:水管局长数据加强版
Description SC省MY市有着庞大的地下水管网络,嘟嘟是MY市的水管局长(就是管水管的啦),嘟嘟作为水管局长的工作就是:每天供水公司可能要将一定量的水从x处送往y处,嘟嘟需要为供水公司找到一 ...
- SpringBoot_02_SpringBoot的配置文件
1.SpringBoot配置文件 SpringBoot是基于约定的,所以很多配置都有默认值,但如果想使用自己的配置替换默认配置的话,就可以使用application.properties或者appli ...