LuceneNet 搜索一
1.引用读取PDF文件组件
FontBox-0.1.0-dev.dll
IKVM.GNU.Classpath.dll
IKVM.Runtime.dll
PDFBox-0.7.3.dll
2.添加 Office 组件引用（Microsoft.Office.Interop.Word、Microsoft.Office.Interop.Excel，用于读取 Word/Excel 文件内容），具体添加步骤从略
3.添加盘古分词
PanGu.dll
PanGu.HighLight.dll
PanGu.Lucene.Analyzer.dll
4.添加Lucene.net 引用
Lucene.Net.dll
5.创建索引库
#region 同步资料到索引库
// Work queue shared by the producer (InitTaskQueue) and the consumer loop (ThreadInvoke).
static Queue<ResourcesModel> TaskQueue = new Queue<ResourcesModel>();

/// <summary>
/// Timer tick handler: loads pending resources into the task queue and
/// logs whether the load succeeded.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void tmResSync_Tick(object sender, EventArgs e)
{
    try
    {
        // Refill the queue, then record the successful run.
        InitTaskQueue();
        LogHelper.writeLog("WinFrom【同步数据索引库读取数据到队列】成功!!");
    }
    catch (Exception loadError)
    {
        // Only the exception message is logged; the handler never rethrows.
        string report = string.Concat("WinFrom【同步数据索引库读取数据到队列】:", loadError.Message);
        LogHelper.writeErrLog(report);
    }
}
/// <summary>
/// Starts a background thread that runs <see cref="ThreadInvoke"/>.
/// </summary>
public void ServiceStart()
{
    // Background thread: it does not keep the process alive on shutdown.
    var worker = new Thread(ThreadInvoke) { IsBackground = true };
    worker.Start();
}
/// <summary>
/// Consumer loop: repeatedly takes one <see cref="ResourcesModel"/> from
/// <c>TaskQueue</c> and passes it to <c>CreateResIndex.CreateIndex</c>.
/// Sleeps while the queue is empty. Never returns; errors are logged and
/// the loop continues.
/// </summary>
public void ThreadInvoke()
{
    // NOTE(review): the original sleep duration was lost from the source text;
    // 1000 ms is a placeholder - confirm the intended idle interval.
    const int idleSleepMilliseconds = 1000;

    while (true)
    {
        try
        {
            ResourcesModel res = null;
            bool hasWork = false;

            // Queue<T> is not thread-safe: the emptiness check and the dequeue
            // must happen under the same lock, otherwise the count can be read
            // while another thread is modifying the queue.
            lock (TaskQueue)
            {
                if (TaskQueue.Count > 0)
                {
                    res = TaskQueue.Dequeue();
                    hasWork = true;
                }
            }

            if (hasWork)
            {
                // Index (or remove from the index) the dequeued resource.
                new CreateResIndex().CreateIndex(res);
            }
            else
            {
                Thread.Sleep(idleSleepMilliseconds);
            }
        }
        catch (Exception ex)
        {
            LogHelper.writeErrLog("WinFrom【同步数据索引库出错】:" + ex.ToString());
        }
    }
}
/// <summary>
/// Reads every row of the resource view that has an upload time and is not
/// yet indexed (IsIndex=0), converts each row to a <see cref="ResourcesModel"/>
/// and appends it to <c>TaskQueue</c>.
/// </summary>
/// <remarks>
/// NOTE(review): nothing here skips rows that are already queued, so a row
/// whose IsIndex is still 0 is enqueued again on every call - confirm that
/// repeated indexing of the same row is acceptable.
/// </remarks>
public void InitTaskQueue()
{
    var query = new CreateResIndex().Get_View_CreateResIndex(" and uploadTime is not null and IsIndex=0 ");
    if (query == null)
    {
        return;
    }

    foreach (System.Data.DataRow row in query.Rows)
    {
        var model = new ResourcesModel();
        model.ID = ReadCellText(row, "ID");
        model.FileName = ReadCellText(row, "FileName");
        model.FilePath = ReadCellText(row, "FilePath");
        model.CreaetBy = ReadCellText(row, "UserName");
        model.Types = ReadCellText(row, "Name");
        model.TypeId = ReadCellText(row, "Type");
        model.SimpleDesc = ReadCellText(row, "SimpleDesc");
        model.Title = ReadCellText(row, "Title");
        model.Tags = ReadCellText(row, "Tag");

        // NOTE(review): the OP literals were lost from the source text. They are
        // restored from the convention documented on ResourcesModel.OP
        // (delete = 0, add = 1, modify = 2): deleted rows get "0", all others "1".
        // Confirm against the original code.
        string isDelText = ReadCellText(row, "IsDel");
        model.OP = isDelText != "" && Convert.ToBoolean(isDelText) ? "0" : "1";

        string uploadTimeText = ReadCellText(row, "uploadTime");
        model.UploadTime = uploadTimeText != ""
            ? Convert.ToDateTime(row["uploadTime"]).ToString("yyyy-MM-dd")
            : "";

        // The consumer thread dequeues under this same lock; Queue<T> is not
        // safe for concurrent Enqueue/Dequeue without it.
        lock (TaskQueue)
        {
            TaskQueue.Enqueue(model);
        }
    }
}

/// <summary>
/// Returns the cell's text, or an empty string when the cell is null or DBNull.
/// </summary>
private static string ReadCellText(System.Data.DataRow row, string column)
{
    // DataRow cells hold DBNull.Value (never null) for SQL NULL, so a plain
    // null comparison is not enough to detect a missing value.
    object value = row[column];
    return value == null || value == DBNull.Value ? "" : value.ToString();
}
#endregion
#region ResourcesModel
/// <summary>
/// Plain data holder describing one resource record that is to be added to,
/// updated in, or removed from the search index. All values are strings.
/// </summary>
public class ResourcesModel
{
    public ResourcesModel()
    {
    }

    /// <summary>Identifier.</summary>
    public string ID { get; set; }

    /// <summary>Title.</summary>
    public string Title { get; set; }

    /// <summary>Tags.</summary>
    public string Tags { get; set; }

    /// <summary>Creator.</summary>
    public string CreaetBy { get; set; }

    /// <summary>Upload time.</summary>
    public string UploadTime { get; set; }

    /// <summary>Category.</summary>
    public string Types { get; set; }

    /// <summary>Short description.</summary>
    public string SimpleDesc { get; set; }

    /// <summary>Content.</summary>
    public string ContextDesc { get; set; }

    /// <summary>Operation marker: delete = 0, add = 1, modify = 2.</summary>
    public string OP { get; set; }

    /// <summary>Type id.</summary>
    public string TypeId { get; set; }

    /// <summary>File path.</summary>
    public string FilePath { get; set; }

    /// <summary>File name.</summary>
    public string FileName { get; set; }
}
#endregion
#region 读取文件
/// <summary>
/// Extracts plain text from resource files (.txt, .doc/.docx, .xls/.xlsx, .pdf)
/// so the text can be indexed.
/// </summary>
public class ReadFilesTxt
{
    /// <summary>
    /// Reads a text file with the system default encoding. If the decoded text
    /// contains the marker "GB2312" (case-insensitive), the file is read again
    /// using the GB2312 encoding.
    /// </summary>
    /// <param name="path">Full path of the text file.</param>
    /// <returns>The file content.</returns>
    public string ResumeTxt(string path)
    {
        string str;
        // using-blocks release the file handle even when ReadToEnd throws.
        using (StreamReader reader = new StreamReader(path, System.Text.Encoding.Default))
        {
            str = reader.ReadToEnd();
        }

        if (System.Text.RegularExpressions.Regex.IsMatch(str, "GB2312", RegexOptions.IgnoreCase))
        {
            using (StreamReader gbReader = new StreamReader(path, System.Text.Encoding.GetEncoding("GB2312")))
            {
                str = gbReader.ReadToEnd();
            }
        }
        return str;
    }

    /// <summary>
    /// Opens a Word document read-only through Office interop and returns its text.
    /// </summary>
    /// <param name="path">Full path of the document.</param>
    private string ResumeWord(string path)
    {
        object missing = System.Reflection.Missing.Value;
        object readOnly = true;
        object docPath = path;
        Microsoft.Office.Interop.Word.Application wordApp = new Microsoft.Office.Interop.Word.Application();
        try
        {
            Microsoft.Office.Interop.Word.Document wordDoc = wordApp.Documents.Open(ref docPath,
                ref missing, ref readOnly, ref missing, ref missing,
                ref missing, ref missing, ref missing, ref missing,
                ref missing, ref missing, ref missing, ref missing,
                ref missing, ref missing, ref missing);
            try
            {
                return wordDoc.Content.Text;
            }
            finally
            {
                wordDoc.Close(ref missing, ref missing, ref missing);
            }
        }
        finally
        {
            // Always quit Word, otherwise a failed read leaves the process running.
            wordApp.Quit(ref missing, ref missing, ref missing);
        }
    }

    /// <summary>
    /// Opens an Excel workbook read-only through Office interop and returns the
    /// concatenated Value2 of every cell in each sheet, for rows and columns
    /// 1..UsedRange.Rows.Count and 1..UsedRange.Columns.Count.
    /// </summary>
    /// <param name="path">Full path of the workbook.</param>
    private string ResumeExcel(string path)
    {
        StringBuilder text = new StringBuilder();
        object readOnly = true;
        object missing = System.Reflection.Missing.Value;

        Microsoft.Office.Interop.Excel.Application xApp = new Microsoft.Office.Interop.Excel.Application();
        try
        {
            xApp.Visible = false;
            Microsoft.Office.Interop.Excel.Workbook xBook = xApp.Workbooks._Open(path,
                missing, readOnly, missing, missing,
                missing, missing, missing, missing,
                missing, missing, missing, missing);
            try
            {
                var count = xBook.Sheets.Count;
                // Sheets and Cells are 1-based, hence the "+ 1" on every index.
                for (int k = 0; k < count; k++)
                {
                    Microsoft.Office.Interop.Excel.Worksheet xSheet =
                        (Microsoft.Office.Interop.Excel.Worksheet)xBook.Sheets[k + 1];
                    var rcount = xSheet.UsedRange.Rows.Count;
                    var ccount = xSheet.UsedRange.Columns.Count;
                    for (int m = 0; m < rcount; m++)
                    {
                        for (int n = 0; n < ccount; n++)
                        {
                            object cellValue = ((Microsoft.Office.Interop.Excel.Range)xSheet.Cells[m + 1, n + 1]).Value2;
                            // Append(object) adds nothing for a null (empty) cell.
                            text.Append(cellValue);
                        }
                    }
                }
            }
            finally
            {
                xBook.Close(missing, missing, missing);
            }
        }
        finally
        {
            // Always quit Excel, otherwise a failed read leaves the process running.
            xApp.Quit();
        }
        return text.ToString();
    }

    /// <summary>
    /// Extracts the text of a PDF file with PDFBox.
    /// </summary>
    /// <param name="path">Full path of the PDF file.</param>
    public string ResumePDF(string path)
    {
        org.pdfbox.pdmodel.PDDocument doc = org.pdfbox.pdmodel.PDDocument.load(path);
        try
        {
            org.pdfbox.util.PDFTextStripper pdfStripper = new org.pdfbox.util.PDFTextStripper();
            return pdfStripper.getText(doc);
        }
        finally
        {
            // Release the document's underlying file resources.
            doc.close();
        }
    }

    /// <summary>
    /// Reads several files and returns their concatenated text.
    /// </summary>
    /// <param name="ResourceRoute">Root folder that is prefixed to every relative path.</param>
    /// <param name="path">Relative file paths separated by ';'. Empty entries are skipped.</param>
    /// <returns>
    /// The text read so far. On the first failure the error is logged and the
    /// remaining files are not read.
    /// </returns>
    public string GetReadContext(string ResourceRoute, string path)
    {
        StringBuilder sb = new StringBuilder();
        try
        {
            if (!string.IsNullOrEmpty(path))
            {
                foreach (string lpath in path.Split(';'))
                {
                    if (!string.IsNullOrEmpty(lpath))
                    {
                        sb.Append(ReadBySuffix(ResourceRoute, lpath));
                    }
                }
            }
        }
        catch (Exception ex)
        {
            LogHelper.writeErrLog("【读取文件出错:文件名称:" + path + " 】 错误消息:" + ex.Message.ToString());
        }
        return sb.ToString();
    }

    /// <summary>
    /// Reads a single file and returns its text.
    /// </summary>
    /// <param name="ResourceRoute">Root folder that is prefixed to the relative path.</param>
    /// <param name="lpath">Relative path of the file.</param>
    /// <returns>The file text, or an empty string when the type is unsupported or reading fails (the error is logged).</returns>
    public string GetReadContextSingle(string ResourceRoute, string lpath)
    {
        StringBuilder sb = new StringBuilder();
        try
        {
            if (!string.IsNullOrEmpty(lpath))
            {
                sb.Append(ReadBySuffix(ResourceRoute, lpath));
            }
        }
        catch (Exception ex)
        {
            LogHelper.writeErrLog("【读取文件出错:文件名称:" + ResourceRoute + lpath + " 】 错误消息:" + ex.Message.ToString());
        }
        return sb.ToString();
    }

    /// <summary>
    /// Picks the reader that matches the file suffix. Returns an empty string
    /// for unsupported suffixes.
    /// </summary>
    private string ReadBySuffix(string ResourceRoute, string lpath)
    {
        // Text after the last '.'; the whole path when there is no '.'.
        string suffix = lpath.Substring(lpath.LastIndexOf('.') + 1);
        string fullPath = ResourceRoute + lpath;

        if (HasSuffix(suffix, "doc") || HasSuffix(suffix, "docx"))
        {
            return ResumeWord(fullPath);
        }
        // The original compared "xlsx" with the whole path instead of the
        // suffix, so .xlsx files were never read.
        if (HasSuffix(suffix, "xls") || HasSuffix(suffix, "xlsx"))
        {
            return ResumeExcel(fullPath);
        }
        if (HasSuffix(suffix, "pdf"))
        {
            return ResumePDF(fullPath);
        }
        if (HasSuffix(suffix, "txt"))
        {
            return ResumeTxt(fullPath);
        }
        return string.Empty;
    }

    /// <summary>Case-insensitive suffix comparison, so "DOC" matches "doc".</summary>
    private static bool HasSuffix(string suffix, string expected)
    {
        return string.Equals(suffix, expected, StringComparison.OrdinalIgnoreCase);
    }
}
#endregion
#region 创建索引
/// <summary>
/// Maintains the Lucene.Net index for resources: adds, replaces and deletes
/// index documents, and marks processed rows as indexed in the database.
/// </summary>
public class CreateResIndex
{
    /// <summary>Index directory, read from the "pathIndex" app setting.</summary>
    public static string IndexPath = ConfigurationManager.AppSettings["pathIndex"];

    /// <summary>Root folder of the resource files, read from the "ResourceRoute" app setting.</summary>
    public static string ResourceRoute = ConfigurationManager.AppSettings["ResourceRoute"];

    #region Properties
    /// <summary>
    /// A new PanGu analyzer instance.
    /// </summary>
    protected Analyzer NewPanGuAnalyzer
    {
        get { return new PanGuAnalyzer(); }
    }

    /// <summary>
    /// Opens the Lucene.Net directory at <see cref="IndexPath"/>. Every access
    /// opens a new directory instance.
    /// </summary>
    public FSDirectory DirectoryLuce
    {
        get
        {
            return FSDirectory.Open(new DirectoryInfo(IndexPath), new NativeFSLockFactory());
        }
    }
    #endregion

    #region Index creation
    /// <summary>
    /// Applies one resource to the index. A resource whose OP marks it as
    /// deleted is removed from the index. Any other resource whose type is a
    /// public resource type is re-indexed: one document for the record itself
    /// plus one document per attached file. Processed ids are then marked as
    /// indexed in the database.
    /// </summary>
    /// <param name="res">Resource to index; when null only the writer is opened, optimized and closed.</param>
    public void CreateIndex(ResourcesModel res)
    {
        if (!System.IO.Directory.Exists(IndexPath))
        {
            System.IO.Directory.CreateDirectory(IndexPath);
        }

        // Open the directory once instead of once per property access.
        FSDirectory directory = DirectoryLuce;
        bool isUpdate = IndexReader.IndexExists(directory);
        if (isUpdate && IndexWriter.IsLocked(directory))
        {
            // Forcibly clears a write lock left behind by an earlier run.
            IndexWriter.Unlock(directory);
        }

        List<string> modifyindex = new List<string>();
        IndexWriter writer = new IndexWriter(directory, NewPanGuAnalyzer, !isUpdate, IndexWriter.MaxFieldLength.UNLIMITED);
        try
        {
            List<string> listIsdex = GetResourceTypePublicResources();
            if (res != null)
            {
                string id = res.ID.ToString().Trim();
                // NOTE(review): the OP literal was lost from the source text; "0" is
                // restored from the convention documented on ResourcesModel.OP
                // (delete = 0) - confirm.
                if (res.OP == "0")
                {
                    writer.DeleteDocuments(new Term("ID", id));
                    modifyindex.Add(id);
                    LogHelper.writeLog("【删除索引编号】 【ID:" + id + "】");
                }
                else if (IsPublicResources(listIsdex, (res.TypeId ?? "").Trim()))
                {
                    if (TryIndexResource(writer, res, id))
                    {
                        modifyindex.Add(id);
                    }
                }
            }
            writer.Optimize();
        }
        finally
        {
            // Close the writer even on failure so the index write lock is released.
            writer.Close();
        }
        ModifyResIndex(modifyindex);
    }

    /// <summary>
    /// Replaces all index documents of one resource. Returns true when every
    /// document was added; on failure the error is logged and false is returned.
    /// </summary>
    private bool TryIndexResource(IndexWriter writer, ResourcesModel res, string id)
    {
        // Remove the previous documents of this resource before re-adding them.
        writer.DeleteDocuments(new Term("ID", id));

        string path = res.FilePath;
        string title = res.Title ?? "";
        string createdBy = res.CreaetBy ?? "";
        string uploadTime = res.UploadTime ?? "";
        string types = res.Types ?? "";
        string simpleDesc = res.SimpleDesc ?? "";
        string tags = res.Tags ?? "";
        string fileName = res.FileName ?? "";
        try
        {
            // Document for the record itself (no file content).
            AddIndex(writer, id, title, tags, simpleDesc, "", types, uploadTime, createdBy, fileName);

            if (!string.IsNullOrEmpty(path))
            {
                string[] paths = path.Split(';');
                string[] names = fileName.Split(';');
                for (int i = 0; i < paths.Length; i++)
                {
                    string lpath = paths[i];
                    if (string.IsNullOrEmpty(lpath))
                    {
                        continue;
                    }
                    // There may be fewer names than paths; GetFileName then falls
                    // back to the path.
                    string lname = i < names.Length ? names[i] : "";
                    string content = new ReadFilesTxt().GetReadContextSingle(ResourceRoute, lpath);
                    string newFileName = GetFileName(lpath, lname);
                    // As in the original: the file text goes into the SimpleDesc
                    // field and the file path into the ContextDesc field.
                    AddIndex(writer, id, newFileName, tags, content, lpath, types, uploadTime, createdBy, fileName);
                }
            }
            LogHelper.writeLog("【添加索引编号】 【ID:" + id + "】");
            return true;
        }
        catch (Exception ex)
        {
            LogHelper.writeLog("【添加索引失败】 【ID:" + id + "】:" + ex.Message.ToString());
            return false;
        }
    }

    /// <summary>
    /// Builds one index document from the given values and adds it to the
    /// writer. Exceptions propagate to the caller with their original stack trace.
    /// </summary>
    public void AddIndex(IndexWriter writer, string ID, string Title, string Tags, string SimpleDesc, string ContextDesc, string Types, string UploadTime, string CreaetBy, string FileName)
    {
        Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document();
        // ID and UploadTime are stored and indexed as single terms (not analyzed).
        doc.Add(new Lucene.Net.Documents.Field("ID", ID, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.NOT_ANALYZED));
        // The text fields are stored and analyzed.
        doc.Add(new Lucene.Net.Documents.Field("Title", Title, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED));
        doc.Add(new Lucene.Net.Documents.Field("Tags", Tags, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED));
        doc.Add(new Lucene.Net.Documents.Field("SimpleDesc", SimpleDesc, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED));
        doc.Add(new Lucene.Net.Documents.Field("FileName", FileName, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED));
        doc.Add(new Lucene.Net.Documents.Field("ContextDesc", ContextDesc, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED));
        doc.Add(new Lucene.Net.Documents.Field("Types", Types, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED));
        doc.Add(new Lucene.Net.Documents.Field("UploadTime", UploadTime, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.NOT_ANALYZED));
        // CreaetBy is stored only; it is not searchable.
        doc.Add(new Lucene.Net.Documents.Field("CreaetBy", CreaetBy, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.NO));
        writer.AddDocument(doc);
    }

    /// <summary>
    /// Returns <paramref name="FileName"/> when it is non-empty; otherwise
    /// derives a name from <paramref name="objfilepath"/>. Returns an empty
    /// string when both are empty.
    /// </summary>
    public string GetFileName(object objfilepath, object FileName)
    {
        if (FileName != null && FileName.ToString() != "")
        {
            return FileName.ToString();
        }
        if (objfilepath != null && objfilepath.ToString() != "")
        {
            string filePath = objfilepath.ToString();
            // NOTE(review): the name is taken after the last ','; a path separator
            // may have been intended - confirm against real FilePath values.
            return filePath.Substring(filePath.LastIndexOf(',') + 1).Replace(";", "");
        }
        return "";
    }
    #endregion

    #region Database access
    /// <summary>
    /// Queries the Res_View_createResIndex view.
    /// </summary>
    /// <param name="whereStr">
    /// SQL fragment appended after "where 1=1". It is concatenated verbatim, so
    /// callers must pass trusted SQL only.
    /// </param>
    /// <returns>The first result table, or an empty table when the query fails or returns nothing.</returns>
    public DataTable Get_View_CreateResIndex(string whereStr)
    {
        string sql = " Select * From Res_View_createResIndex where 1=1 " + whereStr;
        DataTable dt = new DataTable();
        try
        {
            DataSet ds = Ruihua.Common.DbHelperSQL.Query(sql);
            if (ds != null && ds.Tables != null && ds.Tables.Count > 0)
            {
                dt = ds.Tables[0];
            }
        }
        catch (Exception ex)
        {
            LogHelper.writeLog("【 获取中心资料库数据错误】:" + ex.ToString());
        }
        return dt;
    }

    /// <summary>
    /// Sets IsIndex=1 on every ResourceInfoNew row whose id is in
    /// <paramref name="list"/>. Does nothing except logging when the list is empty.
    /// </summary>
    public void ModifyResIndex(List<string> list)
    {
        LogHelper.writeLog("【更新索引编号开始】:" + string.Join(",", list.ToArray()));
        if (list.Count > 0)
        {
            // '-1' keeps the IN (...) list syntactically valid.
            StringBuilder sb = new StringBuilder("'-1'");
            foreach (string id in list)
            {
                // Double embedded single quotes so an id cannot break out of its SQL literal.
                sb.Append(",'").Append((id ?? "").Replace("'", "''")).Append("'");
            }
            string sql = string.Format(" update ResourceInfoNew set IsIndex=1 where id in ({0}) ", sb.ToString());
            int result = Ruihua.Common.DbHelperSQL.ExecuteSql(sql);
            LogHelper.writeLog("【更新索引编号结束:" + result.ToString() + "】:" + string.Join(",", list.ToArray()));
        }
    }

    /// <summary>
    /// Tells whether <paramref name="Id"/> is one of the public resource type ids.
    /// </summary>
    /// <returns>True when <paramref name="list"/> contains <paramref name="Id"/>.</returns>
    public bool IsPublicResources(List<string> list, string Id)
    {
        return list.Contains(Id);
    }

    /// <summary>
    /// Returns the ids (TID) of every ResourceType row reachable from the rows
    /// whose ParentID is an empty string. The result is kept in MemoryCache
    /// under the key "ResourceType" for 1800 seconds.
    /// </summary>
    public List<string> GetResourceTypePublicResources()
    {
        ObjectCache cache = MemoryCache.Default;
        List<string> cached = cache["ResourceType"] as List<string>;
        if (cached != null)
        {
            return cached;
        }

        List<string> publicresource = new List<string>();
        string sql = "select *From ResourceType ";
        DataSet ds = Ruihua.Common.DbHelperSQL.Query(sql);
        if (ds != null && ds.Tables != null && ds.Tables.Count > 0)
        {
            DataTable dt = ds.Tables[0];
            // NOTE(review): the root marker compared here is "", but numeric
            // literals were lost from the source text - confirm the ParentID value
            // that identifies top-level rows.
            var roots = from q1 in dt.AsEnumerable()
                        where q1.Field<string>("ParentID") == ""
                        select q1;
            foreach (var item in roots)
            {
                string tid = item.Field<string>("TID").Trim();
                if (publicresource.Contains(tid))
                {
                    continue;
                }
                publicresource.Add(tid);
                // Descend into the child levels.
                AddListString(ref publicresource, dt, tid);
            }
        }

        CacheItemPolicy policy = new CacheItemPolicy();
        policy.AbsoluteExpiration = DateTimeOffset.Now.AddSeconds(1800.0); // evicted after 60*30 seconds
        cache.Set("ResourceType", publicresource, policy);
        return publicresource;
    }

    /// <summary>
    /// Recursively appends to <paramref name="list"/> the TID of every row of
    /// <paramref name="dt"/> that descends from the row with id <paramref name="Id"/>.
    /// </summary>
    public void AddListString(ref List<string> list, DataTable dt, string Id)
    {
        var children = from q2 in dt.AsEnumerable()
                       where q2.Field<string>("ParentID") == Id
                       select q2;
        foreach (var item in children)
        {
            string tid = item.Field<string>("TID").Trim();
            // Skip ids that are already collected: a self-parented or cyclic row
            // would otherwise recurse without end.
            if (list.Contains(tid))
            {
                continue;
            }
            list.Add(tid);
            AddListString(ref list, dt, tid);
        }
    }
    #endregion
}
#endregion
LuceneNet 搜索一的更多相关文章
- sulin LuceneNet 搜索二
1.添加引用dll using Lucene.Net.Search;using Lucene.Net.Analysis.PanGu;using PanGu;using PanGu.HighLight; ...
- Lucene.Net 入门级实例 浅显易懂。。。
Lucene.Net 阅读目录 开始 Lucene简介 效果图 Demo文件说明 简单使用 重点类的说明 存在问题 调整后 Lucene.Net博文与资源下载 做过站内搜索的朋友应该对Lucene.N ...
- Lucene.Net(转)
出处:http://www.cnblogs.com/piziyimao/archive/2013/01/31/2887072.html 做过站内搜索的朋友应该对Lucene.Net不陌生,没做过的也许 ...
- lucene.net helper类 【结合盘古分词进行搜索的小例子(分页功能)】
转自:http://blog.csdn.net/pukuimin1226/article/details/17558247 添加:2013-12-25 更新:2013-12-26 新增分页功能. ...
- 瞎折腾之 Lucene.Net + MVC 搜索功能(上)
前言 首先,关于Lucene.Net 的文章已经很多了.我这次决定写出来只是为了练练手,虽然在别人看来没什么用,但是自己确实是手动实践了一把.我个人觉得还是有意义的.爱折腾.敢于实践.才能有所收获,才 ...
- Lucene.net 从创建索引到搜索的代码范例
关于Lucene.Net的介绍网上已经很多了在这里就不多介绍Lucene.Net主要分为建立索引,维护索引和搜索索引Field.Store的作用是通过全文检查就能返回对应的内容,而不必再通过id去DB ...
- 记一次企业级爬虫系统升级改造(五):基于JieBaNet+Lucene.Net实现全文搜索
实现效果: 上一篇文章有附全文搜索结果的设计图,下面截一张开发完成上线后的实图: 基本风格是模仿的百度搜索结果,绿色的分页略显小清新. 目前已采集并创建索引的文章约3W多篇,索引文件不算太大,查询速度 ...
- LuceneNet 实现快速大文件大数据查询
做过站内搜索的朋友应该对Lucene.Net不陌生,因为用普通的sql like查询肯定是不行的,太慢了. 首先说明的是--Lucene.Net只是一个全文检索开发包,不是一个成型的搜索引擎, 它的 ...
- Lucene.net(4.8.0) 学习问题记录六:Lucene 的索引系统和搜索过程分析
前言:目前自己在做使用Lucene.net和PanGu分词实现全文检索的工作,不过自己是把别人做好的项目进行迁移.因为项目整体要迁移到ASP.NET Core 2.0版本,而Lucene使用的版本是3 ...
随机推荐
- [笔记]180612 for DevOps
adb devices 识别不了安卓手机:我下的adb interface驱动下载链接:如果设备管理器中ADB Interface是黄色的,就需要先安装adb interface驱动(BD:adb i ...
- WPF 免费控件库
https://github.com/Infragistics/InfragisticsThemesForMicrosoftControls 几款WPF免费控件库,不过运行源码时需要下载三个DLL , ...
- [转]C#的扩展方法解说
C#的扩展方法解说 扩展方法的目的就是为一个现有类型添加一个方法,现有类型既可以是int,string等数据类型,也可以是自定义的数据类型. 为数据类型的添加一个方法的理解:一般来说,int数据类型有 ...
- 关于操作系统中英文切换的.po和.mo介绍
一.文件简介 .po文件,.mo文件,.pot文件是由gettext程序生成或者使用的源代码和编译结果. 1..pot文件 是一种模板文件,其实质与.po文件一样,其中包含了从源代码中提取所有的 ...
- Tensorflow入门篇
参考Tensorflow中文网(http://www.tensorfly.cn/tfdoc/get_started/introduction.html) ,写一个入门. 1.打开pyCharm,新建 ...
- ThinkPHP 删除数据
ThinkPHP删除数据使用delete方法,例如: 直线电机价格 $Form = M('Form'); $Form->delete(5); 表示删除主键为5的数据,delete方法可以删除单个 ...
- IDEA Error:java: Compilation failed: internal java compiler error 解决方案
这是由于版本不一致导致的 file => settings => 搜索找到Java Compiler 把相应jdk版本改成1.8 ctrl+alt+s
- LeeCode-Single Number III
Given an array of numbers nums, in which exactly two elements appear only once and all the other ele ...
- 北京服务业占GDP比重达81.7%
北京服务业占GDP比重达81.7% 2017-05-17 19:46:00 来源: 中国新闻网(北京)举报 0 易信 微信 QQ空间 微博 更多 (原标题:北京服务业占GDP比重达81.7%) ...
- 用JS写的一个简单的时钟
没什么技术含量,单纯的想传上去.手痒了 <!DOCTYPE html> <html> <head> <meta charset="utf-8&quo ...