LuceneNet 搜索一
1.引用读取PDF文件组件
FontBox-0.1.0-dev.dll
IKVM.GNU.Classpath.dll
IKVM.Runtime.dll
PDFBox-0.7.3.dll
2.添加office 组件 这个就过吧
3.添加盘古分词
PanGu.dll
PanGu.HighLight.dll
PanGu.Lucene.Analyzer.dll
4.添加Lucene.net 引用
Lucene.Net.dll
5.创建索引库
#region 同步资料到索引库
static Queue<ResourcesModel> TaskQueue = new Queue<ResourcesModel>();
private void tmResSync_Tick(object sender, EventArgs e)
{
//读取数据到队列
try
{
InitTaskQueue();
LogHelper.writeLog("WinFrom【同步数据索引库读取数据到队列】成功!!");
}
catch (Exception ex)
{
LogHelper.writeErrLog("WinFrom【同步数据索引库读取数据到队列】:" + ex.Message);
}
}
public void ServiceStart()
{
Thread TaskThread = new Thread(new ThreadStart(ThreadInvoke));
TaskThread.IsBackground = true;
TaskThread.Start();
}
public void ThreadInvoke()
{
while (true)
{
try
{
if (TaskQueue.Count > )
{
ResourcesModel res = null;
lock (TaskQueue)
{
res = TaskQueue.Dequeue();
}
//调用方法
new CreateResIndex().CreateIndex(res);
}
else
{
Thread.Sleep();
}
}
catch (Exception ex)
{
LogHelper.writeErrLog("WinFrom【同步数据索引库出错】:"+ex.ToString());
}
}
}
public void InitTaskQueue()
{
//读取资料中心数据
var query = new CreateResIndex().Get_View_CreateResIndex(" and uploadTime is not null and IsIndex=0 ");
if (query!=null)
{
for (int i = ; i < query.Rows.Count; i++)
{
var model =new ResourcesModel();
model.ID =query.Rows[i]["ID"].ToString();
model.FileName=query.Rows[i]["FileName"]!=null ? query.Rows[i]["FileName"].ToString():"";
model.FilePath=query.Rows[i]["FilePath"]!=null ? query.Rows[i]["FilePath"].ToString():"";
model.CreaetBy=query.Rows[i]["UserName"]!=null ? query.Rows[i]["UserName"].ToString():"";
model.Types=query.Rows[i]["Name"]!=null ? query.Rows[i]["Name"].ToString():"";
model.TypeId=query.Rows[i]["Type"]!=null ? query.Rows[i]["Type"].ToString():"";
model.SimpleDesc=query.Rows[i]["SimpleDesc"]!=null ? query.Rows[i]["SimpleDesc"].ToString():"";
model.Title=query.Rows[i]["Title"]!=null ? query.Rows[i]["Title"].ToString():"";
model.Tags=query.Rows[i]["Tag"]!=null ? query.Rows[i]["Tag"].ToString():"";
model.OP = query.Rows[i]["IsDel"] != null && query.Rows[i]["IsDel"].ToString()!="" ? Convert.ToBoolean(query.Rows[i]["IsDel"].ToString())==true ? "" : "":"";
model.UploadTime = query.Rows[i]["uploadTime"] != null && query.Rows[i]["uploadTime"].ToString() != "" ? Convert.ToDateTime(query.Rows[i]["uploadTime"]).ToString("yyyy-MM-dd"):"";
TaskQueue.Enqueue(model);
}
} }
#endregion
#region ResourcesModel
public class ResourcesModel
{
public ResourcesModel() { } /// <summary>
/// 标识
/// </summary>
public string ID { get; set; } /// <summary>
/// 标题
/// </summary>
public string Title { get; set; } /// <summary>
///标签
/// </summary>
public string Tags { get; set; } /// <summary>
///创建人
/// </summary>
public string CreaetBy { get; set; } /// <summary>
///上传时间
/// </summary>
public string UploadTime { get; set; } /// <summary>
///类别
/// </summary>
public string Types { get; set; } /// <summary>
///简介
/// </summary>
public string SimpleDesc { get; set; }
/// <summary>
///内容
/// </summary>
public string ContextDesc { get; set; }
/// <summary>
/// 有来标注是 删除=0 增加=1 修改=2
/// </summary>
public string OP { get; set; }
/// <summary>
/// 类型Id
/// </summary>
public string TypeId { get; set; }
/// <summary>
/// 文件路径
/// </summary>
public string FilePath { get; set; }
/// <summary>
/// 文件名称
/// </summary>
public string FileName { get; set; }
}
#endregion
#region 读取文件
public class ReadFilesTxt
{
public string ResumeTxt(string path)
{
string str = string.Empty; StreamReader reader = new StreamReader(path, System.Text.Encoding.Default);
str = reader.ReadToEnd(); //再通过查询解析出来的的字符串有没有GB2312 的字段,来判断是否是GB2312格式的,如果是,则重新以GB2312的格式解析
System.Text.RegularExpressions.Regex reGB = new System.Text.RegularExpressions.Regex("GB2312", RegexOptions.IgnoreCase);
System.Text.RegularExpressions.Match mcGB = reGB.Match(str);
if (mcGB.Success)
{
StreamReader reader2 = new StreamReader(path, System.Text.Encoding.GetEncoding("GB2312"));
str = reader2.ReadToEnd();
}
return str;
} private string ResumeWord(string path)
{
string str = string.Empty;
object missing = System.Reflection.Missing.Value;
object readOnly = true;
object docPathp = path;
Microsoft.Office.Interop.Word.Application wordApp = new Microsoft.Office.Interop.Word.Application(); Microsoft.Office.Interop.Word.Document wordDoc = wordApp.Documents.Open(ref docPathp,
ref missing,
ref readOnly,
ref missing,
ref missing,
ref missing,
ref missing,
ref missing,
ref missing,
ref missing,
ref missing,
ref missing,
ref missing,
ref missing,
ref missing,
ref missing);
str = wordDoc.Content.Text;
wordDoc.Close(ref missing, ref missing, ref missing);
wordApp.Quit(ref missing, ref missing, ref missing); return str;
} private string ResumeExcel(string path)
{
string str = string.Empty;
//创建Application对象
Microsoft.Office.Interop.Excel.Application xApp = new Microsoft.Office.Interop.Excel.Application();
xApp.Visible = false;
object readOnly = true;
object missing = System.Reflection.Missing.Value;
////得到WorkBook对象,
Microsoft.Office.Interop.Excel.Workbook xBook = xApp.Workbooks._Open(path,
missing, readOnly, missing, missing,
missing, missing, missing, missing,
missing, missing, missing, missing); var count = xBook.Sheets.Count;
Microsoft.Office.Interop.Excel.Worksheet xSheet;
for (int k = ; k < count; k++)
{
xSheet = (Microsoft.Office.Interop.Excel.Worksheet)xBook.Sheets[k + ];
var rcount = xSheet.UsedRange.Rows.Count;
var ccount = xSheet.UsedRange.Columns.Count; for (int m = ; m < rcount; m++)
{
for (int n = ; n < ccount; n++)
{
str = str + ((Microsoft.Office.Interop.Excel.Range)xSheet.Cells[m + , n + ]).Value2;
}
}
} xSheet = null;
xBook.Close(missing, missing, missing);
xApp.Quit(); return str;
} public string ResumePDF(string path)
{ org.pdfbox.pdmodel.PDDocument doc = org.pdfbox.pdmodel.PDDocument.load(path); org.pdfbox.util.PDFTextStripper pdfStripper = new org.pdfbox.util.PDFTextStripper(); string text = pdfStripper.getText(doc); return text; } public string GetReadContext(string ResourceRoute, string path)
{
StringBuilder sb = new StringBuilder(); try
{
if (path != "")
{
string[] paths = path.Split(';');
for (int i = ; i < paths.Length; i++)
{
if (paths[i] != null && paths[i].ToString() != "")
{ string lpath = paths[i].ToString();
var suffix = lpath.Substring(lpath.LastIndexOf(".") + , lpath.Length - lpath.LastIndexOf(".") - );
if ("doc" == suffix || "docx" == suffix)
{
sb.Append(ResumeWord(ResourceRoute + lpath));
}
else if ("xls" == suffix || "xlsx" == lpath)
{
sb.Append(ResumeExcel(ResourceRoute + lpath));
}
else if ("pdf" == suffix)
{
sb.Append(ResumePDF(ResourceRoute + lpath));
}
else if ("txt" == suffix)
{
sb.Append(ResumeTxt(ResourceRoute + lpath));
} }
}
}
}
catch (Exception ex)
{ LogHelper.writeErrLog( "【读取文件出错:文件名称:" + path + " 】 错误消息:" + ex.Message.ToString());
} return sb.ToString();
} public string GetReadContextSingle(string ResourceRoute, string lpath)
{
StringBuilder sb = new StringBuilder();
try
{
if (lpath != "")
{
var suffix = lpath.Substring(lpath.LastIndexOf(".") + , lpath.Length - lpath.LastIndexOf(".") - );
if ("doc" == suffix || "docx" == suffix)
{
sb.Append(ResumeWord(ResourceRoute + lpath));
}
else if ("xls" == suffix || "xlsx" == lpath)
{
sb.Append(ResumeExcel(ResourceRoute + lpath));
}
else if ("pdf" == suffix)
{
sb.Append(ResumePDF(ResourceRoute + lpath));
}
else if ("txt" == suffix)
{
sb.Append(ResumeTxt(ResourceRoute + lpath));
}
}
}
catch (Exception ex)
{ LogHelper.writeErrLog("【读取文件出错:文件名称:" + ResourceRoute + lpath + " 】 错误消息:" + ex.Message.ToString());
} return sb.ToString();
}
}
#endregion
#region 创建索引
public class CreateResIndex
{
public static string IndexPath = ConfigurationManager.AppSettings["pathIndex"];//索引文件路径
public static string ResourceRoute = ConfigurationManager.AppSettings["ResourceRoute"];//文件路径 // private readonly ILog log = LogManager.GetLogger("CreateIndex"); #region 属性
/// <summary>
/// 盘古分词器
/// </summary>
protected Analyzer NewPanGuAnalyzer
{
get { return new PanGuAnalyzer(); } } /// <summary>
/// Lucene.Net的目录-参数
/// </summary>
public FSDirectory DirectoryLuce
{
get
{
return FSDirectory.Open(new DirectoryInfo(IndexPath), new NativeFSLockFactory());
}
}
#endregion #region 创建索引
/// <summary>
///创建索引
/// </summary>
public void CreateIndex(ResourcesModel res)
{
//创建索引目录
if (!System.IO.Directory.Exists(IndexPath))
{
System.IO.Directory.CreateDirectory(IndexPath);
} //FSDirectory directory = FSDirectory.Open(new DirectoryInfo(IndexDic), new NativeFSLockFactory());
bool isUpdate = IndexReader.IndexExists(DirectoryLuce);
if (isUpdate)
{
if (IndexWriter.IsLocked(DirectoryLuce))
{
IndexWriter.Unlock(DirectoryLuce);
}
} IndexWriter writer = new IndexWriter(DirectoryLuce, NewPanGuAnalyzer, !isUpdate, IndexWriter.MaxFieldLength.UNLIMITED);
List<string> listIsdex = GetResourceTypePublicResources();
List<string> modifyindex = new List<string>();
if (res != null)
{
if (res.OP == "")
{
writer.DeleteDocuments(new Term("ID", res.ID.ToString().Trim()));
modifyindex.Add(res.ID.ToString().Trim());
LogHelper.writeLog("【删除索引编号】 【ID:" + res.ID.ToString().Trim() + "】");
}
else
{ if (IsPublicResources(listIsdex, res.TypeId.Trim()))
{ writer.DeleteDocuments(new Term("ID", res.ID.ToString().Trim())); var path = res.FilePath; string ID = res.ID.ToString().Trim();
string Title = res.Title != null ? res.Title.ToString() : ""; string CreaetBy = res.CreaetBy == null ? "" : res.CreaetBy.ToString();
string UploadTime = res.UploadTime;
string Types = res.Types != null ? res.Types.ToString() : "";
string SimpleDesc = res.SimpleDesc == null ? "" : res.SimpleDesc.ToString();
string Tags = res.Tags != null ? res.Tags.ToString() : "";
string FileName = res.FileName;
try
{
string ContextDesc = "";
AddIndex(writer, ID, Title, Tags, SimpleDesc, "", Types, UploadTime, CreaetBy, FileName);
if (path != "")
{
string[] paths = path.Split(';');
string[] pname = FileName.Split(';');
for (int i = ; i < paths.Length; i++)
{
if (paths[i] != null && paths[i].ToString() != "")
{
string lpath = paths[i].ToString();
string lname = pname[i].ToString();
ContextDesc= new ReadFilesTxt().GetReadContextSingle(ResourceRoute, lpath);
//SimpleDesc=ContextDesc ContextDesc=""
string NewFileName = GetFileName(lpath, lname);
AddIndex(writer, ID, NewFileName, Tags, ContextDesc, lpath, Types, UploadTime, CreaetBy, FileName);
}
}
} // string ContextDesc = new ReadFilesTxt().GetReadContext(ResourceRoute,path);
LogHelper.writeLog("【添加索引编号】 【ID:" + res.ID.ToString().Trim() + "】");
modifyindex.Add(ID);
}
catch (Exception ex)
{
LogHelper.writeLog("【添加索引失败】 【ID:" + ID + "】:" + ex.Message.ToString()); } }
}
}
writer.Optimize();
writer.Close();
ModifyResIndex(modifyindex);
} public void AddIndex(IndexWriter writer, string ID, string Title, string Tags, string SimpleDesc, string ContextDesc, string Types, string UploadTime, string CreaetBy,string FileName)
{
try
{
Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document();
doc.Add(new Lucene.Net.Documents.Field("ID", ID, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.NOT_ANALYZED));//存储且索引
doc.Add(new Lucene.Net.Documents.Field("Title", Title, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED));//存储且索引
doc.Add(new Lucene.Net.Documents.Field("Tags", Tags, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED));//存储且索引
doc.Add(new Lucene.Net.Documents.Field("SimpleDesc", SimpleDesc, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED));//存储且索引
doc.Add(new Lucene.Net.Documents.Field("FileName", FileName, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED));//存储且索引
doc.Add(new Lucene.Net.Documents.Field("ContextDesc", ContextDesc, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED));//存储且索引
doc.Add(new Lucene.Net.Documents.Field("Types", Types, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED));//存储且索引
doc.Add(new Lucene.Net.Documents.Field("UploadTime", UploadTime, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.NOT_ANALYZED));//存储且索引
doc.Add(new Lucene.Net.Documents.Field("CreaetBy", CreaetBy, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.NO));
writer.AddDocument(doc);
}
catch (FileNotFoundException fnfe)
{
throw fnfe;
}
catch (Exception ex)
{
throw ex;
}
}
public string GetFileName(object objfilepath, object FileName)
{
string result = "";
if (FileName != null && FileName.ToString() != "")
{
result = FileName.ToString();
}
else
{
if (objfilepath != null && objfilepath.ToString() != "")
{
string filename = objfilepath.ToString().Substring(objfilepath.ToString().LastIndexOf(',') + ).Replace(";", "");
result = filename;
}
}
return result;
}
#endregion #region 获取数据库数据
/// <summary>
/// 获取中心资料库数据
/// </summary>
/// <param name="whereStr"></param>
/// <returns></returns>
public DataTable Get_View_CreateResIndex(string whereStr)
{
string sql = " Select * From Res_View_createResIndex where 1=1 " + whereStr;
DataTable dt = new DataTable(); try
{
DataSet ds = Ruihua.Common.DbHelperSQL.Query(sql);
if (ds != null && ds.Tables != null && ds.Tables.Count > )
{
dt = ds.Tables[];
}
}
catch (Exception ex)
{
LogHelper.writeLog("【 获取中心资料库数据错误】:" + ex.ToString());
}
return dt;
} public void ModifyResIndex(List<string> list)
{
string sql = " update ResourceInfoNew set IsIndex=1 where id in ({0}) ";
StringBuilder sb = new StringBuilder("'-1'");
//Ruihua.Common.DbHelperSQL.connectionString = ConfigurationManager.AppSettings["ResConStr"].ToString();
LogHelper.writeLog("【更新索引编号开始】:" + string.Join(",", list.ToArray()));
if (list.Count > )
{
for (int i = ; i < list.Count; i++)
{
sb.Append(",'" + list[i].ToString() + "'");
}
sql = string.Format(sql, sb.ToString());
int result = Ruihua.Common.DbHelperSQL.ExecuteSql(sql);
LogHelper.writeLog("【更新索引编号结束:" + result.ToString() + "】:" + string.Join(",", list.ToArray()));
}
} /// <summary>
/// 判断是否公共资源
/// </summary>
/// <returns></returns>
public bool IsPublicResources(List<string> list, string Id)
{ if (list.Contains(Id))
{
return true;
}
return false; }
public List<string> GetResourceTypePublicResources()
{
ObjectCache cache = MemoryCache.Default;
List<string> ResourceType = cache["ResourceType"] as List<string>;
List<string> publicresource = new List<string>();
if (ResourceType == null)
{ // Ruihua.Common.DbHelperSQL.connectionString = ConfigurationManager.AppSettings["ResConStr"].ToString();
string sql = "select *From ResourceType ";
DataSet ds = Ruihua.Common.DbHelperSQL.Query(sql);
if (ds != null && ds.Tables != null && ds.Tables.Count > )
{
DataTable dt = ds.Tables[];
var query1 = from q1 in dt.AsEnumerable()
where q1.Field<string>("ParentID") == ""
select q1;
if (query1 != null)
{
foreach (var item in query1)
{
publicresource.Add(item.Field<string>("TID").Trim());
//第二层
AddListString(ref publicresource, dt, item.Field<string>("TID").Trim());
}
}
}
CacheItemPolicy policy = new CacheItemPolicy();
policy.AbsoluteExpiration = DateTimeOffset.Now.AddSeconds(1800.0);//属性设置为 60*30 秒后逐出缓存
cache.Set("ResourceType", publicresource, policy);
}
else
{
publicresource = ResourceType;
}
return publicresource; }
public void AddListString(ref List<string> list, DataTable dt, string Id)
{
var query2 = from q2 in dt.AsEnumerable()
where q2.Field<string>("ParentID") == Id
select q2;
if (query2 != null)
{
foreach (var item in query2)
{
list.Add(item.Field<string>("TID").Trim());
AddListString(ref list, dt, item.Field<string>("TID").Trim());
}
}
} #endregion } #endregion
LuceneNet 搜索一的更多相关文章
- sulin LuceneNet 搜索二
1.添加引用dll using Lucene.Net.Search;using Lucene.Net.Analysis.PanGu;using PanGu;using PanGu.HighLight; ...
- Lucene.Net 入门级实例 浅显易懂。。。
Lucene.Net 阅读目录 开始 Lucene简介 效果图 Demo文件说明 简单使用 重点类的说明 存在问题 调整后 Lucene.Net博文与资源下载 做过站内搜索的朋友应该对Lucene.N ...
- Lucene.Net(转)
出处:http://www.cnblogs.com/piziyimao/archive/2013/01/31/2887072.html 做过站内搜索的朋友应该对Lucene.Net不陌生,没做过的也许 ...
- lucene.net helper类 【结合盘古分词进行搜索的小例子(分页功能)】
转自:http://blog.csdn.net/pukuimin1226/article/details/17558247 添加:2013-12-25 更新:2013-12-26 新增分页功能. ...
- 瞎折腾之 Lucene.Net + MVC 搜索功能(上)
前言 首先,关于Lucene.Net 的文章已经很多了.我这次决定写出来只是为了练练手,虽然在别人看来没什么用,但是自己确实是手动实践了一把.我个人觉得还是有意义的.爱折腾.敢于实践.才能有所收获,才 ...
- Lucene.net 从创建索引到搜索的代码范例
关于Lucene.Net的介绍网上已经很多了在这里就不多介绍Lucene.Net主要分为建立索引,维护索引和搜索索引Field.Store的作用是通过全文检查就能返回对应的内容,而不必再通过id去DB ...
- 记一次企业级爬虫系统升级改造(五):基于JieBaNet+Lucene.Net实现全文搜索
实现效果: 上一篇文章有附全文搜索结果的设计图,下面截一张开发完成上线后的实图: 基本风格是模仿的百度搜索结果,绿色的分页略显小清新. 目前已采集并创建索引的文章约3W多篇,索引文件不算太大,查询速度 ...
- LuceneNet 实现快速大文件大数据查询
做过站内搜索的朋友应该对Lucene.Net不陌生,因为用普通的sql like查询肯定是不行的,太慢了. 首先说明的是--Lucene.Net只是一个全文检索开发包,不是一个成型的搜索引擎, 它的 ...
- Lucene.net(4.8.0) 学习问题记录六:Lucene 的索引系统和搜索过程分析
前言:目前自己在做使用Lucene.net和PanGu分词实现全文检索的工作,不过自己是把别人做好的项目进行迁移.因为项目整体要迁移到ASP.NET Core 2.0版本,而Lucene使用的版本是3 ...
随机推荐
- c# Data = select new{} 返回值的显示
- Activiti学习笔记2 — HelloWorld
一. Maven的POM配置文件 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="htt ...
- nginx 配置文件备份 nginx.conf and vhosts
bogon:vhosts xingchong$ brew services restart nginx Stopping `nginx`... (might take a while) ==> ...
- <a>中的背景色变大
想要调整文字链接背景颜色或图片的大小可以用padding属性: 但火狐和IE数值相同显示相同,但与360数值相同显示不同(上下宽度会变小.)
- html---三列布局
三列布局 1一 1.两边固定 当中自适应 2.当中列要完整显示 3.当中列要优先加载 <!DOCTYPE html> <html> <head> <meta ...
- uoj37 主旋律
题意:一个班级n个人,如果a爱b,那么a->b一条有向边.问有多少种删边集合使得图仍然强联通? n<=15. 标程: #include<cstdio> #include&l ...
- 小程序跳转传参参数值为url时参数丢失
通过先encodeURIComponent,取到值以后再decodeURIComponent,拼接参数正常传递 A页面 switch: function (e) { var aa = 'UNNZVUf ...
- Ubuntu Service说明与使用方法
1 什么是Ubuntu的Service 网上很多资料说, service就是linux中随开机自启动的, 并且在后台运行的程序. 个人认为, 至少对于Ubuntu来说, 这个说法是不太准确的, 这只不 ...
- An invalid property 'jdbcType ' was found in mapping
大概2种原因: 1 放进去的类型与字段的类型不匹配 2 比较变态,xml中=两边不能有空格! 错误示例如下: #{plat,jdbcType = INTEGER}, 去掉空格后: #{plat,jd ...
- 提前关闭Scrapy爬虫的设置
Scrapy的CloseSpider扩展会在满足条件时自动终止爬虫程序.可以设置CLOSESPIDER_TIMEOUT(秒).CLOSESPIDER_ITEMCOUNT.CLOSESPIDER_PAG ...