lucene做简单的文件索引

package com.mylucene;

import java.io.File;

import java.io.FileReader;

import java.io.IOException;

import java.io.Reader;

import java.nio.CharBuffer;

import java.util.ArrayList;

import java.util.List;

import org.apache.lucene.LucenePackage;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.document.TextField;

import org.apache.lucene.index.DirectoryReader;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.IndexWriterConfig;

import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.util.Version;

public class MyLuceneTest {

    /**
     * Rebuilds the index stored in {@code directory} from {@code items}.
     *
     * <p>All existing documents are deleted first. Each item becomes one document: every field
     * declared on the item's class is read through its {@code getXxx()} accessor (looked up by
     * reflection) and added as a stored, analyzed text field named after the Java field.
     * Accessors are expected to return {@code String}; {@code null} values are skipped because
     * Lucene's {@code Field} constructor rejects them.
     *
     * <p>The operation is all-or-nothing: if any step fails, pending changes (including the
     * initial delete) are rolled back so the previously committed index is left untouched.
     *
     * @param analyzer  analyzer used to tokenize field values
     * @param directory index storage; it is left open for the caller
     * @param items     items to index
     * @return {@code true} if every item was indexed and committed, {@code false} otherwise
     */
    private boolean buildIndexer(Analyzer analyzer, Directory directory, List<Item> items) {
        IndexWriter iwriter = null;
        boolean committed = false;
        try {
            iwriter = new IndexWriter(directory, new IndexWriterConfig(
                    Version.LUCENE_47, analyzer));
            // Start from an empty index; the delete only becomes visible on commit.
            iwriter.deleteAll();
            for (Item item : items) {
                Document doc = new Document();
                // java.lang.reflect.Field is fully qualified because it clashes with Lucene's Field.
                for (java.lang.reflect.Field field : item.getClass().getDeclaredFields()) {
                    String fieldName = field.getName();
                    String getMethodName = "get" + toFirstLetterUpperCase(fieldName);
                    Object value = item.getClass().getMethod(getMethodName).invoke(item);
                    if (value == null) {
                        // An unset property is simply absent from the document.
                        continue;
                    }
                    doc.add(new Field(fieldName, (String) value, TextField.TYPE_STORED));
                }
                iwriter.addDocument(doc);
            }
            // close() commits; do it inside the try so a failed commit is reported as failure.
            iwriter.close();
            committed = true;
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (!committed && iwriter != null) {
                try {
                    // Discard the partial rebuild and release the write lock.
                    iwriter.rollback();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return committed;
    }

    /**
     * Searches the index in {@code directory} for {@code keyword} across all {@link Item} fields.
     *
     * <p>The query is parsed with a {@code MultiFieldQueryParser} over the names of the fields
     * declared on {@code Item}. At most 10 hits are returned; each hit is converted back to an
     * {@code Item} by calling the matching {@code setXxx(String)} method with the stored value
     * ({@code null} when the document has no such field).
     *
     * <p><b>Side effect:</b> {@code directory} is closed before this method returns, whether or
     * not the search succeeded, so the caller must not reuse it.
     *
     * @param analyzer  analyzer used to parse the query
     * @param directory index storage to search; closed on return
     * @param keyword   query string in Lucene classic query syntax
     * @return the matching items (possibly empty), or {@code null} if the search failed
     */
    public List<Item> searchIndexer(Analyzer analyzer, Directory directory, String keyword) {
        DirectoryReader ireader = null;
        List<Item> result = new ArrayList<Item>();
        try {
            ireader = DirectoryReader.open(directory);
            IndexSearcher isearcher = new IndexSearcher(ireader);

            // Search every declared Item field.
            java.lang.reflect.Field[] fields = Item.class.getDeclaredFields();
            String[] multiFields = new String[fields.length];
            for (int i = 0; i < fields.length; i++) {
                multiFields[i] = fields[i].getName();
            }
            MultiFieldQueryParser parser = new MultiFieldQueryParser(
                    Version.LUCENE_47, multiFields, analyzer);
            Query query = parser.parse(keyword);

            ScoreDoc[] hits = isearcher.search(query, null, 10).scoreDocs;
            for (ScoreDoc hit : hits) {
                Document hitDoc = isearcher.doc(hit.doc);
                Item item = new Item();
                for (String field : multiFields) {
                    String setMethodName = "set" + toFirstLetterUpperCase(field);
                    Item.class.getMethod(setMethodName, String.class)
                            .invoke(item, hitDoc.get(field));
                }
                result.add(item);
            }
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        } finally {
            // Close each resource independently: the reader is null when open() failed, and a
            // failing reader close must not prevent the directory from being closed.
            if (ireader != null) {
                try {
                    ireader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (directory != null) {
                try {
                    directory.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return result;
    }

    /**
     * Returns {@code str} with its first character converted to upper case.
     *
     * <p>Used to derive accessor names ({@code "id"} becomes {@code "Id"} for {@code getId}).
     * The conversion uses {@code Locale.ROOT} so the result does not depend on the JVM's default
     * locale (under a Turkish locale {@code "i"} would otherwise become a dotted capital I).
     *
     * @param str the text to capitalise; may be {@code null} or empty
     * @return the capitalised text, or {@code str} itself when it is {@code null} or empty
     */
    public static String toFirstLetterUpperCase(String str) {
        if (str == null || str.isEmpty()) {
            return str;
        }
        // Single-character names are capitalised too ("x" -> "X").
        return str.substring(0, 1).toUpperCase(java.util.Locale.ROOT) + str.substring(1);
    }

    /**
     * Demo entry point: indexes every regular file directly under {@code C:/mylucene} into an
     * on-disk index at {@code c:/lucene}, then searches that index for "中国" and prints the hits.
     *
     * <p>Each file is read completely into memory, so this is only suitable for small files.
     *
     * @param args ignored
     * @throws Exception if a data file cannot be read or the index directory cannot be opened
     */
    public static void main(String[] args) throws Exception {
        System.out.println(LucenePackage.get());

        MyLuceneTest demo = new MyLuceneTest();
        // SmartChineseAnalyzer (already imported) can be substituted here.
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);

        File dataDir = new File("C:/mylucene");
        File[] dataFiles = dataDir.listFiles();
        if (dataFiles == null) {
            // listFiles() returns null when the path is missing or is not a directory.
            System.err.println("Cannot list data directory: " + dataDir);
            return;
        }

        List<Item> items = new ArrayList<Item>();
        for (File dataFile : dataFiles) {
            if (!dataFile.isFile()) {
                // Sub-directories cannot be opened with FileReader.
                continue;
            }
            String content = readFully(dataFile);
            System.out.println(content);
            items.add(new Item(dataFile.getCanonicalPath(), dataFile.getName(), content));
        }

        // Store the index on disk.
        File file = new File("c:/lucene");
        Directory directory = FSDirectory.open(file);
        if (!demo.buildIndexer(analyzer, directory, items)) {
            System.err.println("Indexing failed; search skipped.");
            directory.close();
            return;
        }

        // searchIndexer closes the directory and returns null on failure.
        List<Item> result = demo.searchIndexer(analyzer, directory, "中国");
        if (result == null) {
            System.err.println("Search failed.");
            return;
        }
        for (Item item : result) {
            System.out.println(item.toString());
        }
    }

    /**
     * Reads the whole file as text using the platform default charset (as {@code FileReader}
     * does) and always closes the reader.
     */
    private static String readFully(File file) throws IOException {
        Reader reader = new FileReader(file);
        try {
            StringBuilder text = new StringBuilder();
            char[] buffer = new char[8192];
            int count;
            // read() may return fewer chars than requested; loop until end of stream.
            while ((count = reader.read(buffer)) != -1) {
                text.append(buffer, 0, count);
            }
            return text.toString();
        } finally {
            reader.close();
        }
    }

}

package com.mylucene;

public class Item {

    // NOTE: MyLuceneTest reflects over the declared fields of this class and expects a
    // getXxx()/setXxx(String) pair for each one, plus the no-argument constructor.
    private String id;
    private String title;
    private String content;

    /** Creates an item with all properties unset ({@code null}). */
    public Item() {
    }

    /**
     * Creates a fully populated item.
     *
     * @param id      identifier of the item
     * @param title   title of the item
     * @param content text content of the item
     */
    public Item(String id, String title, String content) {
        this.id = id;
        this.title = title;
        this.content = content;
    }

    /** @return the identifier, or {@code null} if unset */
    public String getId() {
        return this.id;
    }

    /** @param id the identifier to store */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the title, or {@code null} if unset */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the text content, or {@code null} if unset */
    public String getContent() {
        return this.content;
    }

    /** @param content the text content to store */
    public void setContent(String content) {
        this.content = content;
    }

    /** Renders the item as {@code [id=...,title=...,content=...]}. */
    @Override
    public String toString() {
        return "[id=" + id + ",title=" + title + ",content=" + content + "]";
    }

}

这里将文件的三个属性（路径、文件名、内容）做了一下抽象，并用另一个类 Item 来表示。在以前的版本中是用 Reader 来读取文件，并且在把文件加入索引时直接把 Reader 对象加入 Field，不需要先将内容全部读出再进行封装。

这种做法的问题是：文件很大的时候内存会存不下，可能导致内存不足或者数组越界。这里应该还可以像以前的版本一样直接对文件建立索引，我想只是我还没有找到好的解决方法，所以应该多研究一下 4.8 的 API。

lucene做简单的文件索引的更多相关文章

lucene 内存索引和文件索引合并
IndexWriter.addIndexes(ramDirectory); http://blog.csdn.net/qq_28042463/article/details/51538283 在luc ...
B树、B-树、B+树、B*树介绍，和B+树更适合做文件索引的原因
今天看数据库,书中提到:由于索引是采用 B 树结构存储的,所以对应的索引项并不会被删除,经过一段时间的增删改操作后,数据库中就会出现大量的存储碎片, 这和磁盘碎片.内存碎片产生原理是类似的,这些存储碎 ...
使用 jquery 的上传文件插件 uploadify 3.1 配合 java 来做一个简单的文件上次功能。并且在界面上有radio 的选择内容也要上传
使用 jquery 的上传文件插件 uploadify 3.1 配合 java 来做一个简单的文件上次功能.并且在界面上有radio 的选择内容也要上传 uploadify 插件的下载和文档地址 ...
2、Lucene 最简单的使用（小例子）
在了解了Lucene以后,我打算亲手来做一个Lucene的小例子,这个例子只是Lucene最简单的应用:使用Lucene实现标准的英文搜索: 1.下载Lucene 下载Lucene,到Lucene的官 ...
用Lucene.net对数据库建立索引及搜索<转>
用Lucene.net对数据库建立索引及搜索最近我一直在研究 Lucene.net ,发现Lucene.net对数据库方面建索引的文章在网上很少见,其实它是可以对数据库进行索引的,我闲着没事,写了个 ...
Lucene3.6.2包介绍，第一个Lucene案例介绍，查看索引信息的工具lukeall介绍，Luke查看的索引库内容，索引查找过程
2.Lucene3.6.2包介绍,第一个Lucene案例介绍,查看索引信息的工具lukeall介绍,Luke查看的索引库内容,索引查找过程 2014-12-07 23:39 2623人阅读评论(0) ...
Lucene的配置及创建索引全文检索
Lucene 是一个开放源代码的全文检索引擎工具包,但它不是一个完整的全文检索引擎,而是一个全文检索引擎的架构,提供了完整的查询引擎和索引引擎,部分文本分析引擎(英文与德文两种西方语言).Lucene ...
Lucene.Net简单例子-01
前面已经简单介绍了Lucene.Net,下面来看一个实际的例子 1.1 引用必要的bll文件.这里不再介绍(Lucene.Net PanGu PanGu.HightLight PanGu.Luc ...
用Lucene对文档进行索引搜索
问题现在给出很多份文档,现在对某个搜索词感兴趣,想找到相关的文档. 简单搜索一种简单粗暴的做法是: 1.读取每个文档:2.找到其中含有搜索词的文档:3.对找到的文档中搜索词出现的次数统计:4.根据 ...

随机推荐

PendingIntent.getBroadcast第四个参数flags
(1) android.app.PendingIntent.FLAG_UPDATE_CURRENT 如果PendingIntent已经存在,保留它并且只替换它的extra数据. (2) android ...
解决 Xcode7 中多个模拟器的办法
转自: http://www.oschina.net/code/snippet_196012_50574 1.关闭xcode 2.终端输入 sudo killall -9 com.apple.Core ...
VS 2013--工程的创建，scanf报错，常用快捷键，行号设置
一.创建一个工程(这里是C++,其他的一样的) 在vs页面上点击文件-->新建-->项目: 会出现如下界面,自己改名字和存贮位置就可以了确定,然后点击下一步: 这样就建好了一个工程,然 ...
win7 php 配置多个网站
1.在C:\WINDOWS\system32\drivers\etc目录下,打开Hosts 添加A站和B站的DNS映射,如127.0.0.1 local.zhengxin.com127.0.0.1 l ...
版本管理神器git上手
由于以前折腾过svn,虽然最终没有用成功,但是也算有经验,git入门还是比较简单的. 在新目录下建立初始化版本库 : git init git add file git add file2 git ...
Webgrid参数格式
显示图片 grid.Column(null,"图片",format:p=>Html.Raw(string.Format("<img src='{0}'/> ...
QT 静态编译后中文可能会出现乱码
QT 静态编译后中文可能会出现乱码.这是因为处理文字编码的 libqcncodecs 库是以 plugin 形式存放在 QT 静态编译目录/plugs/codecs/libqcncodecs.a 文件 ...
javascript入门视频第一天小案例制作零基础开始学习javascript
JavaScript 是我们网页设计师必备的技能之一.我们主要用javascript来写的是网页特效.我们从零基础开始学习javascript入门. 但是,好的同学刚开始不知道怎么学习,接触js,因此 ...
Python 内置函数 range的使用
内置range函数可以用来方便的产生等差的数值序列.如: >>> range(5) [0, 1, 2, 3, 4] >>> range(1,5) [1, 2, 3, ...
[置顶] Java套接字Socket编程
1)概念网络编程基本模型就客户端到服务器的模型,也就是我们常见的C/S模型.简单的说就是两个进程间相互通信的过程.即通信双方一方作为服务器等待客户端提出请求并给以回应,另一方作为客户端向服务器提出请 ...

lucene做简单的文件索引

lucene做简单的文件索引的更多相关文章

随机推荐

热门专题