Lucene 实现 Word、PDF 全文检索源码

创建索引：

import java.io.BufferedReader;

import java.io.File;

import java.io.FileInputStream;

import java.io.FileNotFoundException;

import java.io.FileReader;

import java.io.IOException;

import java.io.InputStream;

import java.io.InputStreamReader;

import java.io.Reader;

import java.io.StringReader;

import java.text.SimpleDateFormat;

import java.util.Date;   

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.DateTools;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.SimpleFSDirectory;

import org.apache.lucene.util.Version;

import org.apache.pdfbox.pdfparser.PDFParser;

import org.apache.pdfbox.pdmodel.PDDocument;

import org.apache.pdfbox.util.PDFTextStripper;

import org.apache.poi.hslf.HSLFSlideShow;

import org.apache.poi.hslf.model.Slide;

import org.apache.poi.hslf.model.TextRun;

import org.apache.poi.hslf.usermodel.RichTextRun;

import org.apache.poi.hslf.usermodel.SlideShow;

import org.apache.poi.hssf.usermodel.HSSFCell;

import org.apache.poi.hssf.usermodel.HSSFDateUtil;

import org.apache.poi.hssf.usermodel.HSSFRow;

import org.apache.poi.hssf.usermodel.HSSFSheet;

import org.apache.poi.hssf.usermodel.HSSFWorkbook;

import org.apache.poi.hwpf.HWPFDocument;

import org.apache.poi.hwpf.usermodel.Paragraph;

import org.apache.poi.hwpf.usermodel.Range;

import org.apache.poi.poifs.filesystem.DocumentEntry;

import org.apache.poi.poifs.filesystem.DocumentInputStream;

import org.apache.poi.poifs.filesystem.POIFSFileSystem;

import org.apache.poi.util.LittleEndian;

/**

 * 创建索引 Lucene 3.0+

 * @author Administrator

 *

 */

public class indexer {   

    /**
     * Builds a fresh Lucene index from every regular file in the data directory.
     *
     * <p>Files are dispatched on their extension, ignoring case: {@code .doc}, {@code .ppt},
     * {@code .xls} and {@code .pdf} go to the matching extractor; every other file, including
     * {@code .txt}, is indexed as plain text. Sub-directories are skipped.
     *
     * @param args ignored
     * @throws Exception if the data directory cannot be listed, the index cannot be opened,
     *         or a file cannot be read or added to the index
     */
    public static void main(String[] args) throws Exception {
        // Where the index files are stored.
        String indexDir = "data\\test\\indexDir";
        // Where the files to be indexed live.
        String dateDir = "data\\test\\dateDir";

        // Check the input first so a bad path does not wipe an existing index.
        File[] files = new File(dateDir).listFiles();
        if (files == null) {
            // listFiles() returns null when the path is missing or is not a directory.
            throw new FileNotFoundException("Not a readable directory: " + dateDir);
        }

        Directory dir = new SimpleFSDirectory(new File(indexDir));
        try {
            // Third argument true: create a new index rather than append to an existing one.
            // MaxFieldLength caps how many terms are indexed per field; UNLIMITED disables the cap.
            IndexWriter indexWriter = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_30),
                    true, IndexWriter.MaxFieldLength.UNLIMITED);
            try {
                for (int i = 0; i < files.length; i++) {
                    File file = files[i];
                    if (!file.isFile()) {
                        // A directory cannot be opened as a stream; skip it instead of failing.
                        continue;
                    }
                    String name = file.getName();
                    Document doc;
                    if (hasExtension(name, ".doc")) {
                        doc = getDocument(file);
                    } else if (hasExtension(name, ".ppt")) {
                        doc = getPPT(file);
                    } else if (hasExtension(name, ".xls")) {
                        doc = getExcel(file);
                    } else if (hasExtension(name, ".pdf")) {
                        doc = getPdf(file);
                    } else {
                        doc = getPlainText(file);
                    }
                    if (doc != null) {
                        indexWriter.addDocument(doc);
                    }
                }
                // Report how many documents the index now holds.
                System.out.println("numDocs：" + indexWriter.numDocs());
            } finally {
                // Always close the writer so the index lock is released even after a failure.
                indexWriter.close();
            }
        } finally {
            dir.close();
        }
    }

    /** Returns whether {@code fileName} ends with {@code extension}, ignoring case. */
    private static boolean hasExtension(String fileName, String extension) {
        int offset = fileName.length() - extension.length();
        return offset >= 0
                && fileName.regionMatches(true, offset, extension, 0, extension.length());
    }

    /**
     * Builds a document whose contents are streamed straight from the file as text.
     *
     * <p>The reader uses the platform default charset and is deliberately left open here:
     * Lucene reads a Reader-valued field only when the document is added to the writer.
     */
    private static Document getPlainText(File file) throws IOException {
        Document doc = new Document();
        doc.add(new Field("contents", new FileReader(file)));
        doc.add(new Field("filename", file.getName(),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("indexDate",
                DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        return doc;
    }

    /**
     * Extracts the text of a legacy Word ({@code .doc}) file into a Lucene document.
     *
     * <p>The document carries three stored fields: {@code filename} (analyzed),
     * {@code contents} (analyzed, the trimmed text of all paragraphs) and {@code indexDate}
     * (not analyzed, today's date at day resolution). Extraction is best effort: if the file
     * cannot be read or parsed, the failure is reported on standard error and whatever text
     * was collected so far, possibly none, is indexed.
     *
     * @param file the Word file to read
     * @return the document to add to the index, never {@code null}
     * @throws Exception declared for callers; extraction failures are reported, not thrown
     */
    public static Document getDocument(File file) throws Exception {
        StringBuilder contents = new StringBuilder();
        FileInputStream fs = null;
        try {
            fs = new FileInputStream(file);
            HWPFDocument doc = new HWPFDocument(fs);
            Range range = doc.getRange();
            int paragraphCount = range.numParagraphs();
            for (int i = 0; i < paragraphCount; i++) {
                Paragraph paragraph = range.getParagraph(i);
                contents.append(paragraph.text());
            }
        } catch (Exception e) {
            // Keep indexing the file by name, but say why its text is missing.
            System.err.println("Failed to extract text from " + file.getAbsolutePath());
            e.printStackTrace();
        } finally {
            if (fs != null) {
                try {
                    fs.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }

        Document document = new Document();
        document.add(new Field("filename", file.getName(), Field.Store.YES,
                Field.Index.ANALYZED));
        document.add(new Field("contents", contents.toString().trim(), Field.Store.YES,
                Field.Index.ANALYZED));
        document.add(new Field("indexDate",
                DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        return document;
    }

    /**
     * Extracts the text of a legacy PowerPoint ({@code .ppt}) file into a Lucene document.
     *
     * <p>The text of every text run on every slide is collected, one run per line, so that
     * the last word of one run and the first word of the next are indexed as separate terms.
     * The document carries the stored fields {@code filename}, {@code contents} and
     * {@code indexDate}.
     *
     * @param pptFile the presentation to read
     * @return the document to add to the index, never {@code null}
     * @throws IOException if the file cannot be opened or parsed
     */
    public static Document getPPT(File pptFile) throws IOException {
        StringBuilder contents = new StringBuilder();
        InputStream is = new FileInputStream(pptFile);
        try {
            SlideShow ppt = new SlideShow(new HSLFSlideShow(is));
            Slide[] slides = ppt.getSlides();
            for (int i = 0; i < slides.length; i++) {
                TextRun[] runs = slides[i].getTextRuns();
                for (int j = 0; j < runs.length; j++) {
                    String text = runs[j].getText();
                    if (text != null) {
                        contents.append(text).append('\n');
                    }
                }
            }
        } finally {
            // Release the file handle whether or not parsing succeeded.
            is.close();
        }

        Document document = new Document();
        document.add(new Field("filename", pptFile.getName(), Field.Store.YES,
                Field.Index.ANALYZED));
        document.add(new Field("contents", contents.toString().trim(), Field.Store.YES,
                Field.Index.ANALYZED));
        document.add(new Field("indexDate",
                DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        return document;
    }

    /**
     * Extracts the text of a PDF file into a Lucene document.
     *
     * <p>Extraction is best effort: if the file cannot be read or parsed, the stack trace is
     * printed and the document is indexed with empty contents. The document carries the
     * stored fields {@code filename}, {@code contents} and {@code indexDate}, the same set
     * the Word and PowerPoint extractors produce.
     *
     * @param pdf the PDF file to read
     * @return the document to add to the index, never {@code null}
     */
    public static Document getPdf(File pdf) {
        String text = "";
        FileInputStream is = null;
        PDDocument doc = null;
        try {
            is = new FileInputStream(pdf);
            PDFParser parser = new PDFParser(is);
            parser.parse();
            doc = parser.getPDDocument();
            text = new PDFTextStripper().getText(doc);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (is != null) {
                try {
                    is.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
            if (doc != null) {
                try {
                    doc.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }

        Document document = new Document();
        document.add(new Field("filename", pdf.getName(), Field.Store.YES,
                Field.Index.ANALYZED));
        document.add(new Field("contents", text, Field.Store.YES,
                Field.Index.ANALYZED));
        // Previously omitted here, which left PDF hits without an index date.
        document.add(new Field("indexDate",
                DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        return document;
    }

    /**
     * Extracts the cell text of a legacy Excel ({@code .xls}) workbook into a Lucene document.
     *
     * <p>Sheets and rows each start on a new line and cell values are separated by a space,
     * so neighbouring cells are indexed as separate terms. String cells contribute their
     * text, date-formatted numeric cells are written as {@code yyyy-MM-dd}, and other numeric
     * cells contribute their number (whole numbers without a trailing {@code .0}). Cells of
     * any other type are skipped. The document carries the stored fields {@code filename},
     * {@code contents} and {@code indexDate}.
     *
     * @param fileExcel the workbook to read
     * @return the document to add to the index, never {@code null}
     * @throws Exception if the file cannot be opened or parsed
     */
    public static Document getExcel(File fileExcel) throws Exception {
        StringBuilder content = new StringBuilder();
        // One formatter for the whole workbook instead of one per date cell.
        SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");

        InputStream is = new FileInputStream(fileExcel);
        try {
            HSSFWorkbook workbook = new HSSFWorkbook(is);
            for (int sheetNum = 0; sheetNum < workbook.getNumberOfSheets(); sheetNum++) {
                HSSFSheet sheet = workbook.getSheetAt(sheetNum);
                content.append("\n");
                if (sheet == null) {
                    continue;
                }
                for (int rowNum = 0; rowNum <= sheet.getLastRowNum(); rowNum++) {
                    content.append("\n");
                    HSSFRow row = sheet.getRow(rowNum);
                    if (row == null) {
                        continue;
                    }
                    // getLastCellNum() is already "last index + 1", hence the strict bound.
                    for (int cellNum = 0; cellNum < row.getLastCellNum(); cellNum++) {
                        HSSFCell cell = row.getCell(cellNum);
                        if (cell == null) {
                            continue;
                        }
                        if (cell.getCellType() == HSSFCell.CELL_TYPE_STRING) {
                            content.append(cell.getRichStringCellValue().getString()).append(' ');
                        } else if (cell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
                            if (HSSFDateUtil.isCellDateFormatted(cell)) {
                                content.append(dateFormat.format(cell.getDateCellValue()));
                            } else {
                                double value = cell.getNumericCellValue();
                                if (value == Math.rint(value) && Math.abs(value) < 1e15) {
                                    // Whole number: index "42" rather than "42.0".
                                    content.append((long) value);
                                } else {
                                    content.append(value);
                                }
                            }
                            content.append(' ');
                        }
                    }
                }
            }
        } finally {
            // Release the file handle whether or not parsing succeeded.
            is.close();
        }

        Document document = new Document();
        document.add(new Field("filename", fileExcel.getName(), Field.Store.YES,
                Field.Index.ANALYZED));
        document.add(new Field("contents", content.toString(), Field.Store.YES,
                Field.Index.ANALYZED));
        // Previously omitted here, which left spreadsheet hits without an index date.
        document.add(new Field("indexDate",
                DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        return document;
    }

    /**
     * Reads a local HTML file as UTF-8 text.
     *
     * <p>Every line is returned terminated by {@code "\n"}, whatever line separator the file
     * uses. The charset is fixed to UTF-8, so a page declaring another encoding in its header
     * will come back garbled. Reading is best effort: on any failure the stack trace is
     * printed and the text read so far, possibly empty, is returned.
     *
     * @param urlString path of the file to read (a file system path, despite the name)
     * @return the file's text, or what could be read of it before a failure
     */
    public static String readHtml(String urlString) {
        StringBuilder content = new StringBuilder();
        BufferedReader reader = null;
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(new File(urlString));
            reader = new BufferedReader(new InputStreamReader(fis, "utf-8"));
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line).append("\n");
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Close on every path; closing the reader also closes the underlying stream.
            try {
                if (reader != null) {
                    reader.close();
                } else if (fis != null) {
                    fis.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return content.toString();
    }

}

　　搜索索引


import java.io.File;

import java.io.IOException;   

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.queryParser.ParseException;

import org.apache.lucene.queryParser.QueryParser;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.search.TopDocs;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.SimpleFSDirectory;

import org.apache.lucene.util.Version;

/**
 * Searches the index built by the indexer (Lucene 3.0+ API) and prints the file names of
 * the best matches.
 *
 * @author Administrator
 */
public class searcher {

    /**
     * Runs one query against the {@code contents} field and prints the total hit count
     * followed by the stored {@code filename} of up to ten top-scoring documents.
     *
     * @param args optional; {@code args[0]} is the query text, defaulting to {@code "arcgis"}
     * @throws IOException if the index cannot be opened or read
     * @throws ParseException if the query text is not valid query-parser syntax
     */
    public static void main(String[] args) throws IOException, ParseException {
        // Where the index files are stored.
        String indexDir = "data\\test\\indexDir";
        String queryText = (args != null && args.length > 0) ? args[0] : "arcgis";

        Directory dir = new SimpleFSDirectory(new File(indexDir));
        try {
            // Unlike IndexWriter, a searcher only needs the index directory.
            IndexSearcher indexSearch = new IndexSearcher(dir);
            try {
                // Arguments: Lucene version, default field to search, analyzer for the query text.
                QueryParser queryParser = new QueryParser(Version.LUCENE_30,
                        "contents", new StandardAnalyzer(Version.LUCENE_30));
                Query query = queryParser.parse(queryText);

                // TopDocs.scoreDocs holds the ids of the best hits; totalHits counts all matches.
                TopDocs hits = indexSearch.search(query, 10);
                System.out.println("找到了" + hits.totalHits + "个");

                // Load each hit's stored fields and print its file name.
                for (int i = 0; i < hits.scoreDocs.length; i++) {
                    ScoreDoc sdoc = hits.scoreDocs[i];
                    Document doc = indexSearch.doc(sdoc.doc);
                    System.out.println(doc.get("filename"));
                }
            } finally {
                // Close even when parsing or searching throws.
                indexSearch.close();
            }
        } finally {
            dir.close();
        }
    }
}

lucene 实现word，pdf全文检索源码的更多相关文章

《C++实践之路.pdf》源码
> 源码下载方法 < >> 打开微信 >> 扫描下方二维码 >> 关注林哥私房菜 >> 输入对应编号获取百度网盘提取密码全书源码[已更新完 ...
Spring实战（中文4,5版） PDF含源码
Spring实战读者评价看了一半后再做评论,物流速度挺快,正版行货,只是运输过程有点印记,但是想必大家和你关注内容,spring 4比之3更加关注的是使用注解做开发,对于初学者还是很有用,但是不排 ...
C#导出文本内容到word文档源码
将做工程过程中较好的代码片段珍藏起来,下面的代码内容是关于C#导出文本内容到word文档的代码,希望能对小伙伴们也有好处.<%@ Page Language="C#" Aut ...
tomcat　源码解析
how_tomcat_works https://www.uzh.ch/cmsssl/dam/jcr:00000000-29c9-42ee-0000-000074fab75a/how_tomcat_w ...
java源码剖析: 对象内存布局、JVM锁以及优化
一.目录 1.启蒙知识预热:CAS原理+JVM对象头内存存储结构 2.JVM中锁优化:锁粗化.锁消除.偏向锁.轻量级锁.自旋锁. 3.总结:偏向锁.轻量级锁,重量级锁的优缺点. 二.启蒙知识预热开启 ...
java集合树状结构及源码
java集合树状结构及源码最近一直想看一下java集合的源码,毕竟平时用的比较多,但总是感觉是跟着习惯new出来一个对象,比如ArrayList,HashMap等等,所以就简单的看了一下,了解了一下 ...
jdk源码剖析二: 对象内存布局、synchronized终极原理
很多人一提到锁,自然第一个想到了synchronized,但一直不懂源码实现,现特地追踪到C++层来剥开synchronized的面纱. 网上的很多描述大都不全,让人看了不够爽,看完本章,你将彻底了解 ...
早前阅读live555源码做的笔记
早前阅读live555源码的时候做了一些简单的笔记.现在看来那个时候对C++的理解还是不够,还有很多不足.当时对很多名词也不是很熟悉,对一些类的描述也很生硬,所以笔记中有一些不通畅之处. 阅读live ...
tomcat 源码分析
Tomcat源码分析——Session管理分析(下) Tomcat源码分析——Session管理分析(上) Tomcat源码分析——请求原理分析(下) Tomcat源码分析——请 ...

随机推荐

初学ios遇到问题记录01
刚刚接触IOS,花了一段时间看我基础部分的OC后就想试着弄个小程序,于是看到 http://www.cnblogs.com/LooDo/p/3907064.html博文中的小程序,博主分析的很详细, ...
android112 jni 把java的字符串转换成c的字符串，数组处理
package com.itheima.charencode; import android.os.Bundle; import android.app.Activity; import androi ...
在Linux环境中使用Ext3文件系统
Linux缺省情况下使用的文件系统为Ext2,ext2文件系统的确高效稳定.但是,随着Linux系统在关键业务中的应用,Linux文件系统的弱点也渐渐显露出来了:其中系统缺省使用的ext2文件系统 ...
最简便的MySql数据库备份方法
http://www.kankanews.com/ICkengine/archives/194.shtml 使用MYSQL进行数据库备份,又很正规的数据库备份方法,同其他的数据库服务器有相同的概念,但 ...
C#操控条形码扫描枪
// 条码扫描器 // 窗体部分相关代码: using System; using System.Collections.Generic; using System.ComponentModel; u ...
검색엔진의 크롤링과 인덱싱의 차이 (robots.txt 파일과 meta robots 태그의 차이점)
검색엔진의 크롤링과 인덱싱의 차이크롤링 제어는 robots.txt인덱싱 제어는 < meta name="robots" content="noindex& ...
pop动画大全只能时代程序员更应该关心效果而不是冷冰冰的代码
下载地址 https://pan.baidu.com/s/1o8pQWau
IOS_OC_本地推送知识总结
知识点介绍一. 推送通知介绍(了解) 二. 本地推送通知本地通知的基本使用本地通知的不常用属性删除重复的通知通知的处理1-跳转界面通知的处理2-程序退出分类的设置/快捷回复一. 推送通 ...
C#一些小技巧
在C#实现类似Typedef的所有功能 Typedef这个关键字,是比较好用的东西,因为有时候我们需要使用一些别名来帮助我们记忆某些结构体或者类的共用.(个人觉得这是C与C++唯一能吸引我的东西)为了 ...
ios专题 - 图片（UIImage）获取方法
说到图片获取的方法,就得看API文档. UIImage生成实例的方法有: 1)imageNamed 从指定文件返回对象. 这个方法有个比较特殊的地方:该方法首先从系统缓存中寻找该图片,如果有,则从缓存 ...

lucene 实现word，pdf全文检索源码

lucene 实现word，pdf全文检索源码的更多相关文章

随机推荐

热门专题