创建索引:

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.text.SimpleDateFormat;
import java.util.Date; import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.usermodel.RichTextRun;
import org.apache.poi.hslf.usermodel.SlideShow;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.LittleEndian;
/**
* 创建索引 Lucene 3.0+
* @author Administrator
*
*/
public class indexer { /**
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
//保存索引文件的地方
String indexDir = "data\\test\\indexDir";
//将要搜索TXT文件的地方
String dateDir = "data\\test\\dateDir";
IndexWriter indexWriter = null;
//创建Directory对象
Directory dir = new SimpleFSDirectory(new File(indexDir));
//创建IndexWriter对象,
//第一个参数是Directory,第二个是分词器,
//第三个表示是否是创建,如果为false为在此基础上面修改,
//第四表示表示分词的最大值,比如说new MaxFieldLength(2),就表示两个字一分,
//一般用IndexWriter.MaxFieldLength.LIMITED
indexWriter = new IndexWriter(dir,new StandardAnalyzer(Version.LUCENE_30),true,
IndexWriter.MaxFieldLength.UNLIMITED);
File[] files = new File(dateDir).listFiles();
for (int i = 0; i < files.length; i++) {
Document doc = null;
if(files[i].getName().endsWith(".txt")){
doc = new Document();
//创建Field对象,并放入doc对象中
doc.add(new Field("contents", new FileReader(files[i])));
doc.add(new Field("filename", files[i].getName(),
Field.Store.YES, Field.Index.NOT_ANALYZED));
doc.add(new Field("indexDate",DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
Field.Store.YES,Field.Index.NOT_ANALYZED));
}else if(files[i].getName().endsWith(".doc")){
doc = getDocument(files[i]);
}else if(files[i].getName().endsWith(".ppt")){
doc = getPPT(files[i]);
}else if(files[i].getName().endsWith(".xls")){
doc = getExcel(files[i]);
}else if(files[i].getName().endsWith(".pdf")){
doc = getPdf(files[i]);
}else{
doc = new Document();
//创建Field对象,并放入doc对象中
doc.add(new Field("contents", new FileReader(files[i])));
doc.add(new Field("filename", files[i].getName(),
Field.Store.YES, Field.Index.NOT_ANALYZED));
doc.add(new Field("indexDate",DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
Field.Store.YES,Field.Index.NOT_ANALYZED)); }
//写入IndexWriter
if(doc!= null) indexWriter.addDocument(doc);
}
//查看IndexWriter里面有多少个索引
System.out.println("numDocs:"+indexWriter.numDocs());
indexWriter.close(); } public static Document getDocument(File file) throws Exception {
String docPath = file.getAbsolutePath();
String title = file.getName(); // 创建Document
Document document = new Document(); /*InputStream inputStream = null;
Reader contents = null;
try {
inputStream = new FileInputStream(file);
} catch (FileNotFoundException e) {
e.printStackTrace();
} WordExtractor extractor = new WordExtractor();
//try{
// POIFSFileSystem fsys = new POIFSFileSystem(inputStream);
// DocumentEntry headerProps =
// (DocumentEntry)fsys.getRoot().getEntry("WordDocument");
// DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
// byte[] header = new byte[headerProps.getSize()]; // din.read(header);
// din.close(); // int info = LittleEndian.getShort(header, 0xa);
// if ((info & 0x4) != 0)
// {
// throw new FastSavedException("Fast-saved files are unsupported at this time");
// }
// if ((info & 0x100) != 0)
// {
// throw new PasswordProtectedException("This document is password protected");
// }
//}finally{ //} try {
contents = new StringReader(extractor.extractText(inputStream));
} catch (Exception e) {
e.printStackTrace();
}*/ StringBuffer contents = new StringBuffer("");// 文档内容
try {
FileInputStream fs = new FileInputStream(docPath);
HWPFDocument doc = new HWPFDocument(fs);
Range range = doc.getRange();
int paragraphCount = range.numParagraphs();// 段落
for (int i = 0; i < paragraphCount; i++) {// 遍历段落读取数据
Paragraph pp = range.getParagraph(i);
contents.append(pp.text());
} } catch (Exception e) { }
String cont = contents.toString().trim(); document.add(new Field("filename", title, Field.Store.YES,
Field.Index.ANALYZED));//TOKENIZED
//document.add(new Field("contents", contents));
document.add(new Field("contents", cont,Field.Store.YES,Field.Index.ANALYZED));
//document.add(new Field("path", docPath, Field.Store.YES,Field.Index.ANALYZED));
document.add(new Field("indexDate",DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
Field.Store.YES,Field.Index.NOT_ANALYZED));
return document;
} public static Document getPPT(File pptFile) throws IOException{
String docPath = pptFile.getAbsolutePath();
String title = pptFile.getName(); StringBuffer contents = new StringBuffer("");// 文档内容
InputStream is = new FileInputStream(pptFile);
SlideShow ppt = new SlideShow(new HSLFSlideShow(is));
Slide[] slides = ppt.getSlides();
//提取文本信息
/*for (Slide each : slides) {
//System.out.println("title:" + each.getTitle()) ;
//System.out.println("content:") ;
TextRun[] textRuns = each.getTextRuns();
for (int i=0 ;i< textRuns.length; i++ ) {
//System.out.println(textRuns[i].getText());
RichTextRun[] richTextRuns = textRuns[i].getRichTextRuns();
for (int j = 0; j < richTextRuns.length; j++) {
//System.out.println(richTextRuns[j].getText());
contents.append(richTextRuns[j].getText());
}
}
contents.append(each.getTitle());
}*/
for(int i=0;i <slides.length;i++){
TextRun[] t = slides[i].getTextRuns();//为了取得幻灯片的文字内容,建立TextRun
for(int j=0;j <t.length;j++){
contents.append(t[j].getText());//这里会将文字内容加到content中去
}
//contents.append(slides[i].getTitle());
} Document document = new Document();
String cont = contents.toString().trim(); document.add(new Field("filename", title, Field.Store.YES,
Field.Index.ANALYZED));//TOKENIZED
//document.add(new Field("contents", contents));
document.add(new Field("contents", cont,Field.Store.YES,Field.Index.ANALYZED));
//document.add(new Field("path", docPath, Field.Store.YES,Field.Index.ANALYZED));
document.add(new Field("indexDate",DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
Field.Store.YES,Field.Index.NOT_ANALYZED));
return document;
} public static Document getPdf(File pdf) {
String pdfpath = pdf.getAbsolutePath();
// 创建输入流读取pdf文件
String title = pdf.getName();
String result = "";
FileInputStream is = null;
PDDocument doc = null;
try {
is = new FileInputStream(pdf);
PDFParser parser = new PDFParser(is);
parser.parse();
doc = parser.getPDDocument();
PDFTextStripper stripper = new PDFTextStripper();
result = stripper.getText(doc); } catch (Exception e) { e.printStackTrace();
} finally {
if (is != null) {
try {
is.close();
} catch (Exception e) {
e.printStackTrace();
}
}
if (doc != null) {
try {
doc.close();
} catch (Exception e) {
e.printStackTrace();
}
}
}
Document document = new Document();
document.add(new Field("filename", title, Field.Store.YES,
Field.Index.ANALYZED));//TOKENIZED
document.add(new Field("contents", result, Field.Store.YES,
Field.Index.ANALYZED));
//document.add(new Field("path", pdfpath, Field.Store.YES,Field.Index.ANALYZED));
return document;
} public static Document getExcel(File fileExcel) throws Exception { InputStream is = new FileInputStream(fileExcel);
StringBuffer content = new StringBuffer(); HSSFWorkbook workbook = new HSSFWorkbook(is); for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) {
HSSFSheet aSheet = workbook.getSheetAt(numSheets);// 获得一个sheet
content.append("\n");
if (null == aSheet) {
continue;
}
for (int rowNum = 0; rowNum <= aSheet.getLastRowNum(); rowNum++) {
content.append("\n");
HSSFRow aRow = aSheet.getRow(rowNum);
if (null == aRow) {
continue;
} for (short cellNum = 0; cellNum <= aRow.getLastCellNum(); cellNum++) {
HSSFCell aCell = aRow.getCell(cellNum);
if (null == aCell) {
continue;
} if (aCell.getCellType() == HSSFCell.CELL_TYPE_STRING) {
content.append(aCell.getRichStringCellValue().getString());
} else if (aCell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
boolean b = HSSFDateUtil.isCellDateFormatted(aCell);
if (b) {
Date date = aCell.getDateCellValue();
SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd");
content.append(df.format(date));
}
}
}
}
} String cont = content.toString();
Document document = new Document();
document.add(new Field("filename",fileExcel.getName(), Field.Store.YES,
Field.Index.ANALYZED));//TOKENIZED
document.add(new Field("contents", cont, Field.Store.YES,
Field.Index.ANALYZED));
//document.add(new Field("path", pdfpath, Field.Store.YES,Field.Index.ANALYZED));
return document;
} public static String readHtml(String urlString) { StringBuffer content = new StringBuffer("");
File file = new File(urlString);
FileInputStream fis = null;
try {
fis = new FileInputStream(file);
// 读取页面
BufferedReader reader = new BufferedReader(new InputStreamReader(
fis,"utf-8"));//这里的字符编码要注意,要对上html头文件的一致,否则会出乱码 String line = null; while ((line = reader.readLine()) != null) {
content.append(line + "\n");
}
reader.close();
} catch (Exception e) {
e.printStackTrace();
}
String contentString = content.toString();
return contentString;
}
}

  搜索索引


import java.io.File;
import java.io.IOException; import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
/**
 * Searches the Lucene 3.0 index built by the indexer and prints the file names
 * of the best-matching documents.
 *
 * @author Administrator
 */
public class searcher {

    /**
     * Runs one query against the {@code contents} field and prints up to ten hits.
     *
     * @param args optional; {@code args[0]} is the query text, defaulting to
     *        {@code "arcgis"} when absent
     * @throws IOException if the index cannot be opened or read
     * @throws ParseException if the query text is not valid query syntax
     */
    public static void main(String[] args) throws IOException, ParseException {
        // Where the index files are stored.
        String indexDir = "data\\test\\indexDir";
        String queryText = (args != null && args.length > 0) ? args[0] : "arcgis";

        Directory dir = new SimpleFSDirectory(new File(indexDir));
        try {
            // Unlike IndexWriter, the searcher only needs the index directory.
            IndexSearcher indexSearch = new IndexSearcher(dir);
            try {
                // Arguments: Lucene version, default field to search, analyzer for the query text.
                QueryParser queryParser = new QueryParser(Version.LUCENE_30,
                        "contents", new StandardAnalyzer(Version.LUCENE_30));
                Query query = queryParser.parse(queryText);
                // TopDocs.scoreDocs holds the ids of the (at most 10) best hits.
                TopDocs hits = indexSearch.search(query, 10);
                // totalHits is the number of matches overall, not just those returned.
                System.out.println("找到了" + hits.totalHits + "个");
                // Load each hit's stored document and print its filename field.
                for (ScoreDoc sdoc : hits.scoreDocs) {
                    Document doc = indexSearch.doc(sdoc.doc);
                    System.out.println(doc.get("filename"));
                }
            } finally {
                // Release the index even when parsing or searching fails.
                indexSearch.close();
            }
        } finally {
            dir.close();
        }
    }
}

  

lucene 实现word,pdf全文检索源码的更多相关文章

  1. 《C++实践之路.pdf》源码

    > 源码下载方法 < >> 打开微信 >> 扫描下方二维码 >> 关注林哥私房菜 >> 输入对应编号获取百度网盘提取密码 全书源码[已更新完 ...

  2. Spring实战(中文4,5版) PDF含源码

    Spring实战 读者评价 看了一半后再做评论,物流速度挺快,正版行货,只是运输过程有点印记,但是想必大家和你关注内容,spring 4比之3更加关注的是使用注解做开发,对于初学者还是很有用,但是不排 ...

  3. C#导出文本内容到word文档源码

    将做工程过程中较好的代码片段珍藏起来,下面的代码内容是关于C#导出文本内容到word文档的代码,希望能对小伙伴们也有好处.<%@ Page Language="C#" Aut ...

  4. tomcat 源码解析

    how_tomcat_works https://www.uzh.ch/cmsssl/dam/jcr:00000000-29c9-42ee-0000-000074fab75a/how_tomcat_w ...

  5. java源码剖析: 对象内存布局、JVM锁以及优化

    一.目录 1.启蒙知识预热:CAS原理+JVM对象头内存存储结构 2.JVM中锁优化:锁粗化.锁消除.偏向锁.轻量级锁.自旋锁. 3.总结:偏向锁.轻量级锁,重量级锁的优缺点. 二.启蒙知识预热 开启 ...

  6. java集合树状结构及源码

    java集合树状结构及源码 最近一直想看一下java集合的源码,毕竟平时用的比较多,但总是感觉是跟着习惯new出来一个对象,比如ArrayList,HashMap等等,所以就简单的看了一下,了解了一下 ...

  7. jdk源码剖析二: 对象内存布局、synchronized终极原理

    很多人一提到锁,自然第一个想到了synchronized,但一直不懂源码实现,现特地追踪到C++层来剥开synchronized的面纱. 网上的很多描述大都不全,让人看了不够爽,看完本章,你将彻底了解 ...

  8. 早前阅读live555源码做的笔记

    早前阅读live555源码的时候做了一些简单的笔记.现在看来那个时候对C++的理解还是不够,还有很多不足.当时对很多名词也不是很熟悉,对一些类的描述也很生硬,所以笔记中有一些不通畅之处. 阅读live ...

  9. tomcat 源码分析

    Tomcat源码分析——Session管理分析(下)    Tomcat源码分析——Session管理分析(上)     Tomcat源码分析——请求原理分析(下)     Tomcat源码分析——请 ...

随机推荐

  1. 征服 Nginx + Tomcat【转】

    征服 Nginx + Tomcat Server Architecture/Distributed nginxtomcatsessioncluster  2年前一直折腾Apache,现如今更习惯Ngi ...

  2. JS数组定义

     JS数组定义收藏 function StorePage(){d=document;t=d.selection?(d.selection.type!='None'?d.selection.create ...

  3. iOS开发中一些常用的方法

    1.压缩图片 #pragma mark 处理图片 - (void)useImage:(UIImage *)image { NSLog(@"with-----%f heught-----%f& ...

  4. Linux内核:关于中断你须要知道的

    1.中断处理程序与其它内核函数真正的差别在于,中断处理程序是被内核调用来对应中断的,而它们执行于中断上下文(原子上下文)中,在该上下文中执行的代码不可堵塞. 中断就是由硬件打断操作系统. 2.异常与中 ...

  5. “:Choose a destination with a supported architecture in order to run on this device.”

    我在编译从GitHub上clone下来的<TweeJump>时,出现如下错误:":Choose a destination with a supported architectu ...

  6. FileFilter, FilenameFilter用法和文件排序

    FileFilter和FilenameFilter这两个类的用法都很简单,都只有一个方法 FileFilter /*** @param pathname The abstract pathname t ...

  7. jquery mobile跳转到指定id时怎样传递参数

    在jquery mobile 中,每一个页面都是一个page,当我们需要从一个页面跳转到另一个页面时,可以在href中指定id,可是该怎么把一个page中的参数传递到另外一个page中,几经琢磨,发现 ...

  8. ArcGIS中文件共享锁定数据溢出 这个方法不行,建议用gdb,不要用mdb

    ArcGIS中文件共享锁定数据溢出 (2011-11-24 15:52:41) 转载▼ 标签: 杂谈 分类: GIS 文件共享锁定数溢出.(Error 3052)1. Access数据库,同时操作大量 ...

  9. HTML左边和右边是固定的宽度但是中间是自动的布局方式

    对于这个布局方式我们可以是用绝对定位的方式来实现这个效果 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN&q ...

  10. 安卓蓝牙技术Bluetooth使用流程(Bluetooth详解)

    一:蓝牙设备之间的通讯首要包含了四个进程 设置蓝牙设备 寻觅局域网内也许或许匹配的设备 衔接设备 设备之间的数据传输 二:详细编程完结 1. 发动蓝牙功用 首要经过调用静态办法getDefaultAd ...