lucene 实现word,pdf全文检索源码
创建索引: import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.text.SimpleDateFormat;
import java.util.Date; import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.usermodel.RichTextRun;
import org.apache.poi.hslf.usermodel.SlideShow;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.LittleEndian;
/**
* 创建索引 Lucene 3.0+
* @author Administrator
*
*/
public class indexer { /**
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
//保存索引文件的地方
String indexDir = "data\\test\\indexDir";
//将要搜索TXT文件的地方
String dateDir = "data\\test\\dateDir";
IndexWriter indexWriter = null;
//创建Directory对象
Directory dir = new SimpleFSDirectory(new File(indexDir));
//创建IndexWriter对象,
//第一个参数是Directory,第二个是分词器,
//第三个表示是否是创建,如果为false为在此基础上面修改,
//第四表示表示分词的最大值,比如说new MaxFieldLength(2),就表示两个字一分,
//一般用IndexWriter.MaxFieldLength.LIMITED
indexWriter = new IndexWriter(dir,new StandardAnalyzer(Version.LUCENE_30),true,
IndexWriter.MaxFieldLength.UNLIMITED);
File[] files = new File(dateDir).listFiles();
for (int i = 0; i < files.length; i++) {
Document doc = null;
if(files[i].getName().endsWith(".txt")){
doc = new Document();
//创建Field对象,并放入doc对象中
doc.add(new Field("contents", new FileReader(files[i])));
doc.add(new Field("filename", files[i].getName(),
Field.Store.YES, Field.Index.NOT_ANALYZED));
doc.add(new Field("indexDate",DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
Field.Store.YES,Field.Index.NOT_ANALYZED));
}else if(files[i].getName().endsWith(".doc")){
doc = getDocument(files[i]);
}else if(files[i].getName().endsWith(".ppt")){
doc = getPPT(files[i]);
}else if(files[i].getName().endsWith(".xls")){
doc = getExcel(files[i]);
}else if(files[i].getName().endsWith(".pdf")){
doc = getPdf(files[i]);
}else{
doc = new Document();
//创建Field对象,并放入doc对象中
doc.add(new Field("contents", new FileReader(files[i])));
doc.add(new Field("filename", files[i].getName(),
Field.Store.YES, Field.Index.NOT_ANALYZED));
doc.add(new Field("indexDate",DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
Field.Store.YES,Field.Index.NOT_ANALYZED)); }
//写入IndexWriter
if(doc!= null) indexWriter.addDocument(doc);
}
//查看IndexWriter里面有多少个索引
System.out.println("numDocs:"+indexWriter.numDocs());
indexWriter.close(); } public static Document getDocument(File file) throws Exception {
String docPath = file.getAbsolutePath();
String title = file.getName(); // 创建Document
Document document = new Document(); /*InputStream inputStream = null;
Reader contents = null;
try {
inputStream = new FileInputStream(file);
} catch (FileNotFoundException e) {
e.printStackTrace();
} WordExtractor extractor = new WordExtractor();
//try{
// POIFSFileSystem fsys = new POIFSFileSystem(inputStream);
// DocumentEntry headerProps =
// (DocumentEntry)fsys.getRoot().getEntry("WordDocument");
// DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
// byte[] header = new byte[headerProps.getSize()]; // din.read(header);
// din.close(); // int info = LittleEndian.getShort(header, 0xa);
// if ((info & 0x4) != 0)
// {
// throw new FastSavedException("Fast-saved files are unsupported at this time");
// }
// if ((info & 0x100) != 0)
// {
// throw new PasswordProtectedException("This document is password protected");
// }
//}finally{ //} try {
contents = new StringReader(extractor.extractText(inputStream));
} catch (Exception e) {
e.printStackTrace();
}*/ StringBuffer contents = new StringBuffer("");// 文档内容
try {
FileInputStream fs = new FileInputStream(docPath);
HWPFDocument doc = new HWPFDocument(fs);
Range range = doc.getRange();
int paragraphCount = range.numParagraphs();// 段落
for (int i = 0; i < paragraphCount; i++) {// 遍历段落读取数据
Paragraph pp = range.getParagraph(i);
contents.append(pp.text());
} } catch (Exception e) { }
String cont = contents.toString().trim(); document.add(new Field("filename", title, Field.Store.YES,
Field.Index.ANALYZED));//TOKENIZED
//document.add(new Field("contents", contents));
document.add(new Field("contents", cont,Field.Store.YES,Field.Index.ANALYZED));
//document.add(new Field("path", docPath, Field.Store.YES,Field.Index.ANALYZED));
document.add(new Field("indexDate",DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
Field.Store.YES,Field.Index.NOT_ANALYZED));
return document;
} public static Document getPPT(File pptFile) throws IOException{
String docPath = pptFile.getAbsolutePath();
String title = pptFile.getName(); StringBuffer contents = new StringBuffer("");// 文档内容
InputStream is = new FileInputStream(pptFile);
SlideShow ppt = new SlideShow(new HSLFSlideShow(is));
Slide[] slides = ppt.getSlides();
//提取文本信息
/*for (Slide each : slides) {
//System.out.println("title:" + each.getTitle()) ;
//System.out.println("content:") ;
TextRun[] textRuns = each.getTextRuns();
for (int i=0 ;i< textRuns.length; i++ ) {
//System.out.println(textRuns[i].getText());
RichTextRun[] richTextRuns = textRuns[i].getRichTextRuns();
for (int j = 0; j < richTextRuns.length; j++) {
//System.out.println(richTextRuns[j].getText());
contents.append(richTextRuns[j].getText());
}
}
contents.append(each.getTitle());
}*/
for(int i=0;i <slides.length;i++){
TextRun[] t = slides[i].getTextRuns();//为了取得幻灯片的文字内容,建立TextRun
for(int j=0;j <t.length;j++){
contents.append(t[j].getText());//这里会将文字内容加到content中去
}
//contents.append(slides[i].getTitle());
} Document document = new Document();
String cont = contents.toString().trim(); document.add(new Field("filename", title, Field.Store.YES,
Field.Index.ANALYZED));//TOKENIZED
//document.add(new Field("contents", contents));
document.add(new Field("contents", cont,Field.Store.YES,Field.Index.ANALYZED));
//document.add(new Field("path", docPath, Field.Store.YES,Field.Index.ANALYZED));
document.add(new Field("indexDate",DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
Field.Store.YES,Field.Index.NOT_ANALYZED));
return document;
} public static Document getPdf(File pdf) {
String pdfpath = pdf.getAbsolutePath();
// 创建输入流读取pdf文件
String title = pdf.getName();
String result = "";
FileInputStream is = null;
PDDocument doc = null;
try {
is = new FileInputStream(pdf);
PDFParser parser = new PDFParser(is);
parser.parse();
doc = parser.getPDDocument();
PDFTextStripper stripper = new PDFTextStripper();
result = stripper.getText(doc); } catch (Exception e) { e.printStackTrace();
} finally {
if (is != null) {
try {
is.close();
} catch (Exception e) {
e.printStackTrace();
}
}
if (doc != null) {
try {
doc.close();
} catch (Exception e) {
e.printStackTrace();
}
}
}
Document document = new Document();
document.add(new Field("filename", title, Field.Store.YES,
Field.Index.ANALYZED));//TOKENIZED
document.add(new Field("contents", result, Field.Store.YES,
Field.Index.ANALYZED));
//document.add(new Field("path", pdfpath, Field.Store.YES,Field.Index.ANALYZED));
return document;
} public static Document getExcel(File fileExcel) throws Exception { InputStream is = new FileInputStream(fileExcel);
StringBuffer content = new StringBuffer(); HSSFWorkbook workbook = new HSSFWorkbook(is); for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) {
HSSFSheet aSheet = workbook.getSheetAt(numSheets);// 获得一个sheet
content.append("\n");
if (null == aSheet) {
continue;
}
for (int rowNum = 0; rowNum <= aSheet.getLastRowNum(); rowNum++) {
content.append("\n");
HSSFRow aRow = aSheet.getRow(rowNum);
if (null == aRow) {
continue;
} for (short cellNum = 0; cellNum <= aRow.getLastCellNum(); cellNum++) {
HSSFCell aCell = aRow.getCell(cellNum);
if (null == aCell) {
continue;
} if (aCell.getCellType() == HSSFCell.CELL_TYPE_STRING) {
content.append(aCell.getRichStringCellValue().getString());
} else if (aCell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
boolean b = HSSFDateUtil.isCellDateFormatted(aCell);
if (b) {
Date date = aCell.getDateCellValue();
SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd");
content.append(df.format(date));
}
}
}
}
} String cont = content.toString();
Document document = new Document();
document.add(new Field("filename",fileExcel.getName(), Field.Store.YES,
Field.Index.ANALYZED));//TOKENIZED
document.add(new Field("contents", cont, Field.Store.YES,
Field.Index.ANALYZED));
//document.add(new Field("path", pdfpath, Field.Store.YES,Field.Index.ANALYZED));
return document;
} public static String readHtml(String urlString) { StringBuffer content = new StringBuffer("");
File file = new File(urlString);
FileInputStream fis = null;
try {
fis = new FileInputStream(file);
// 读取页面
BufferedReader reader = new BufferedReader(new InputStreamReader(
fis,"utf-8"));//这里的字符编码要注意,要对上html头文件的一致,否则会出乱码 String line = null; while ((line = reader.readLine()) != null) {
content.append(line + "\n");
}
reader.close();
} catch (Exception e) {
e.printStackTrace();
}
String contentString = content.toString();
return contentString;
}
}
搜索索引
import java.io.File;
import java.io.IOException; import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
/**
* 搜索索引 Lucene 3.0+
* @author Administrator
*
*/
public class searcher { public static void main(String[] args) throws IOException, ParseException {
//保存索引文件的地方
String indexDir = "data\\test\\indexDir";
Directory dir = new SimpleFSDirectory(new File(indexDir));
//创建 IndexSearcher对象,相比IndexWriter对象,这个参数就要提供一个索引的目录就行了
IndexSearcher indexSearch = new IndexSearcher(dir);
//创建QueryParser对象,第一个参数表示Lucene的版本,第二个表示搜索Field的字段,第三个表示搜索使用分词器
QueryParser queryParser = new QueryParser(Version.LUCENE_30,
"contents", new StandardAnalyzer(Version.LUCENE_30));
//生成Query对象
Query query = queryParser.parse("arcgis");
//搜索结果 TopDocs里面有scoreDocs[]数组,里面保存着索引值
TopDocs hits = indexSearch.search(query,10);
//hits.totalHits表示一共搜到多少个
System.out.println("找到了"+hits.totalHits+"个");
//循环hits.scoreDocs数据,并使用indexSearch.doc方法把Document还原,再拿出对应的字段的值
for (int i = 0; i < hits.scoreDocs.length; i++) {
ScoreDoc sdoc = hits.scoreDocs[i];
Document doc = indexSearch.doc(sdoc.doc);
System.out.println(doc.get("filename"));
}
indexSearch.close();
}
}
lucene 实现word,pdf全文检索源码的更多相关文章
- 《C++实践之路.pdf》源码
> 源码下载方法 < >> 打开微信 >> 扫描下方二维码 >> 关注林哥私房菜 >> 输入对应编号获取百度网盘提取密码 全书源码[已更新完 ...
- Spring实战(中文4,5版) PDF含源码
Spring实战 读者评价 看了一半后在做评论,物流速度挺快,正版行货,只是运输过程有点印记,但是想必大家和你关注内容,spring 4必之3更加关注的是使用注解做开发,对于初学者还是很有用,但是不排 ...
- C#导出文本内容到word文档源码
将做工程过程中较好的代码片段珍藏起来,下面的代码内容是关于C#导出文本内容到word文档的代码,希望能对小伙伴们也有好处.<%@ Page Language="C#" Aut ...
- tomcat 源码解析
how_tomcat_works https://www.uzh.ch/cmsssl/dam/jcr:00000000-29c9-42ee-0000-000074fab75a/how_tomcat_w ...
- java源码剖析: 对象内存布局、JVM锁以及优化
一.目录 1.启蒙知识预热:CAS原理+JVM对象头内存存储结构 2.JVM中锁优化:锁粗化.锁消除.偏向锁.轻量级锁.自旋锁. 3.总结:偏向锁.轻量级锁,重量级锁的优缺点. 二.启蒙知识预热 开启 ...
- java集合树状结构及源码
java集合树状结构及源码 最近一直想看一下java集合的源码,毕竟平时用的比较多,但总是感觉是跟着习惯new出来一个对象,比如ArrayList,HashMap等等,所以就简单的看了一下,了解了一下 ...
- jdk源码剖析二: 对象内存布局、synchronized终极原理
很多人一提到锁,自然第一个想到了synchronized,但一直不懂源码实现,现特地追踪到C++层来剥开synchronized的面纱. 网上的很多描述大都不全,让人看了不够爽,看完本章,你将彻底了解 ...
- 早前阅读live555源码做的笔记
早前阅读live555源码的时候做了一些简单的笔记.现在看来那个时候对C++的理解还是不够,还有很多不足.当时对很多名词也不是很熟悉,对一些类的描述也很生硬,所以笔记中有一些不通畅之处. 阅读live ...
- tomcat 源码分析
Tomcat源码分析——Session管理分析(下) Tomcat源码分析——Session管理分析(上) Tomcat源码分析——请求原理分析(下) Tomcat源码分析——请 ...
随机推荐
- PHP扩展开发01:第一个扩展【转】
我们先假设业务场景,是需要有这么一个扩展,提供一个叫ccvita_string的函数,他的主要作用是返回一段字符.(这个业务场景实在太假,大家就这么看看吧)对应的PHP代码可能是这样: functio ...
- Android Studio-ApplicationId 与 PackageName
BuildType 中所设置的『applicationIdSuffix』属性,按照这个属性的字面翻译为:『applicationId 的后缀』 Android 应用都有自己的包名.包名是设备上每个应用 ...
- Java加密算法 AES
版权声明:本文为博主原创文章,转载请注明出处. [java] view plain copy print?在CODE上查看代码片派生到我的代码片 package com.stone.security; ...
- mysql sqlmap 注入尝试
假设注入点为 http://www.abc.com/news.php?id=12 //探测数据库信息 sqlmap -u http://www.abc.com/news.php?id=12 –dbs ...
- C# ADO.NET参数查询
using System; using System.Collections.Generic; using System.ComponentModel; using System.Data; usin ...
- jQuery Validation Plugin学习
http://blog.csdn.net/violet_day/article/details/14109261 jQuery Validation Plugin Demo 一.默认校验规则 (1)r ...
- SQL Server 2008 Values 新用途
SQL Server 2008中新增功能:可以使用单个Insert命令插入多行. Create table Demo_Values (PKID int not null identity(1,1) p ...
- Js 学习资料
Js 语法 http://www.php100.com/manual/jquery/ jqGrid控件 http://www.trirand.com/blog/jqgrid/jqgrid.html
- MVC小系列(十六)【在控制器级别或具体Action级别上动态设定模板页(Layout)】
方法就是使用:ActionFilterAttribute它的几个方法:OnActionExecuted,OnActionExecuting,OnResultExecuted和OnResultExecu ...
- 经典SQL语句大全(绝对的经典)
”,start为起始位置,length为字符串长度,实际应用中以len(expression)取得其长度3,right(char_expr,int_expr) 返回字符串右边第int_expr个字符, ...