Lucene笔记一

Lucene就是一个全文检索的工具，建立索引用的，类似于新华字典的目录

这里使用的是lucene-4.4.0版本，入门代码所需jar包如下图所示（解压lucene-4.4.0后的目录）：

入门代码：

import java.io.File;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.IntField;

import org.apache.lucene.document.StringField;

import org.apache.lucene.document.TextField;

import org.apache.lucene.document.Field.Store;

import org.apache.lucene.index.DirectoryReader;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.IndexWriterConfig;

import org.apache.lucene.index.IndexableField;

import org.apache.lucene.index.Term;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.search.TermQuery;

import org.apache.lucene.search.TopDocs;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.util.Version;

import org.junit.Test;

/*8

 * luceneDemo

 *

 */

public class TestLucene {

    /**

     * 通过lucene 提供的api 对数据建立索引，indexWriter

     * @throws IOException

     *

     */

    @Test

    public void testAdd() throws IOException{

        //索引在硬盘上面存放的位置..

        Directory directory=FSDirectory.open(new File("D:/INDEX"));

        //lucene 当前使用的版本...

        Version matchVersion=Version.LUCENE_44;

        //分词器...(把一段文本分词)（黑马程序员是高端的培训机构）

        //analzyer 是一个抽象类，具体的切分词规则由子类实现...

        Analyzer analyzer=new StandardAnalyzer(matchVersion);

        IndexWriterConfig config=new IndexWriterConfig(matchVersion, analyzer);

        //构造索引写入的对象..

        IndexWriter indexWriter=new IndexWriter(directory, config);

        //往索引库里面写数据..

        //索引库里面的数据都是document 一个document相当于是一条记录

        //这个document里面的数据相当于索引结构..

        Document document=new Document();

        IndexableField indexableField=new IntField("id",1, Store.YES);

        IndexableField stringfield=new StringField("title","对王召廷的个人评价",Store.YES);

        IndexableField teIndexableField=new TextField("content","风流倜傥有点黄",Store.YES);

        document.add(indexableField);

        document.add(stringfield);

        document.add(teIndexableField);

        //索引库里面接收的数据都是document对象

        indexWriter.addDocument(document);

        indexWriter.close();

    }

    /**

     * 对建立的索引进行搜索...

     * 通过indexSearcher 去搜索...

     * @throws IOException

     */

    @Test

    public void testSearcher() throws IOException{

        //索引在硬盘上面存放的位置..

        Directory directory=FSDirectory.open(new File("D:/INDEX"));

        //把索引目录里面的索引读取到IndexReader 当中...

        IndexReader indexReader=DirectoryReader.open(directory);

//        /构造搜索索引的对象..

        IndexSearcher indexSearcher=new IndexSearcher(indexReader);

        //Query 它是一个查询条件对象，它是一个抽象类，不同的查询规则就构造不同的子类...

        Query query=new TermQuery(new Term("title", "对王召廷的个人评价"));

        //检索符合query 条件的前面N 条记录..

        //

        TopDocs topDocs=indexSearcher.search(query, 10);

        //返回总记录数...

        System.out.println(topDocs.totalHits);

        //存放的都是document 的id

        ScoreDoc scoreDocs []=topDocs.scoreDocs;

        for(ScoreDoc scoreDoc:scoreDocs){

            //返回的就是document id

            int docID=scoreDoc.doc;

            //我还需要根据id 检索到对应的document

            Document document=indexSearcher.doc(docID);

            System.out.println("id=="+document.get("id"));

            System.out.println("title=="+document.get("title"));

            System.out.println("content=="+document.get("content"));

        }

    }

}

原理分析图：

demo演示：

根据入门代码流程提炼工具类代码：

import java.io.File;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.index.DirectoryReader;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.IndexWriterConfig;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.util.Version;

/**

 * lucene 工具类...

 * @author Administrator

 *

 */

/**

 * 提炼规则，假设这段代码可以完成一个功能，把这个代码提炼到一个方法里面去，假设这个方法在某个业务罗继承可以共用，那么往上抽取，

 * 假设在其它逻辑层也可以用，提炼到工具类里面去。

 *

 */

public class LuceneUtils {

    private static IndexWriter indexWriter=null;

    private static IndexSearcher indexSearcher=null;

    //索引存放目录..

    private static Directory directory=null;

    private static IndexWriterConfig indexWriterConfig=null;

    private static Version version=null;

    private static Analyzer analyzer=null;

    static {

        try {

            directory=FSDirectory.open(new File(Constants.URL));

            version=Version.LUCENE_44;

            analyzer=new StandardAnalyzer(version);

            indexWriterConfig=new IndexWriterConfig(version, analyzer);

        } catch (IOException e) {

            e.printStackTrace();

        }

    }

    /**

     *

     * @return 返回用于操作索引的对象...

     * @throws IOException

     */

    public static IndexWriter getIndexWriter() throws IOException{

        indexWriter=new IndexWriter(directory, indexWriterConfig);

        return indexWriter;

    }

    /**

     * 返回用于搜索索引的对象...

     * @return

     * @throws IOException

     */

    public static IndexSearcher  getIndexSearcher() throws IOException{

        IndexReader indexReader=DirectoryReader.open(directory);

        indexSearcher=new IndexSearcher(indexReader);

        return indexSearcher;

    }

    /**

     *

     * 返回lucene 当前的版本...

     * @return

     */

    public static Version getVersion() {

        return version;

    }

    /**

     *

     * 返回lucene 当前使用的分词器..

     * @return

     */

    public static Analyzer getAnalyzer() {

        return analyzer;

    }

}

public class Constants {

    /**

     * 索引存放的目录

     */

    public static final String URL="d:/indexdir/news";

}

bean：

package cn.itcast.bean;

public class Article {

    private int id;

    public int getId() {

        return id;

    }

    public void setId(int id) {

        this.id = id;

    }

    public String getTitle() {

        return title;

    }

    public void setTitle(String title) {

        this.title = title;

    }

    public String getContent() {

        return content;

    }

    public void setContent(String content) {

        this.content = content;

    }

    public String getAuthor() {

        return author;

    }

    public void setAuthor(String author) {

        this.author = author;

    }

    public String getUrl() {

        return url;

    }

    public void setUrl(String url) {

        this.url = url;

    }

    private String title;

    private String content;

    private String author;

    private String url;

}

转换工具类：

package cn.itcast.lucene;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.IntField;

import org.apache.lucene.document.StringField;

import org.apache.lucene.document.TextField;

import org.apache.lucene.document.Field.Store;

import org.apache.lucene.index.IndexableField;

import cn.itcast.bean.Article;

/*8

 * 对象与索引库document 之间的转化

 *

 */

public class ArticleToDocument {

    public static Document articleToDocument(Article article){

        Document document=new Document();

        IntField idfield=new IntField("id", article.getId(), Store.YES);

        //StringField 对应的值不分词，textField 分词..

        TextField titleField=new TextField("title", article.getTitle(),Store.YES);

        TextField contentField=new TextField("content", article.getContent(),Store.YES);

        //修改这个字段对应的权重值，默认这个值为1f

//        contentField.setBoost(3f);

        StringField authorField=new StringField("author", article.getAuthor(), Store.YES);

        StringField urlField=new StringField("url", article.getUrl(), Store.YES);

        document.add(idfield);

        document.add(titleField);

        document.add(contentField);

        document.add(authorField);

        document.add(urlField);

        return document;

    }

}

Dao层：

package cn.itcast.dao;

import java.io.IOException;

import org.apache.lucene.document.Document;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.Term;

import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;

import org.apache.lucene.queryparser.classic.QueryParser;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.search.TopDocs;

import cn.itcast.bean.Article;

import cn.itcast.lucene.ArticleToDocument;

import cn.itcast.uitls.LuceneUtils;

/**

 * 使用lucene 的API 来操作索引库..

 * @author Administrator

 *

 */

public class LuceneDao {

    public void addIndex(Article article) throws IOException{

        IndexWriter indexWriter=LuceneUtils.getIndexWriter();

        Document doc=ArticleToDocument.articleToDocument(article);

        indexWriter.addDocument(doc);

        indexWriter.close();

    }

    /**

     * 删除符合条件的记录...

     * @param fieldName

     * @param fieldValue

     * @throws IOException

     */

    public void delIndex(String fieldName,String fieldValue) throws IOException{

        IndexWriter indexWriter=LuceneUtils.getIndexWriter();

        //一定要梦想，万一实现了勒

        Term term=new Term(fieldName, fieldValue);

        indexWriter.deleteDocuments(term);

        indexWriter.close();

    }

    /**

     *

     * 更新

     *

     * update table set ?  where condtion

     * @throws IOException

     *

     *

     */

    public void updateIndex(String fieldName,String fieldValue,Article article) throws IOException{

        IndexWriter indexWriter=LuceneUtils.getIndexWriter();

        /**

         * 1:term 设置更新的条件...

         *

         * 2:设置更新的内容的对象..

         *

         */

        Term term=new Term(fieldName,fieldValue);

        Document doc=ArticleToDocument.articleToDocument(article);

        /**

         *

         * 在lucene 里面是先删除符合这个条件term 的记录，在创建一个doc 记录...

         *

         */

        indexWriter.updateDocument(term, doc);

        indexWriter.close();

    }

    /**

     * 0,10

     * 10,10

     * 20,10

     * @param keywords

     * @throws Exception

     */

    public void findIndex(String keywords,int firstResult,int maxResult) throws Exception{

        IndexSearcher indexSearcher=LuceneUtils.getIndexSearcher();

        //第一个条件.. 单字段查询...

//        Query query=new TermQuery(new Term("title","梦想"))

        //select *  from  table where fieldname="" or content=""

        String fields []={"title","content"};

        //第二种条件：使用查询解析器，多字段。。。 我们需要重新导入一个jar queryParser 的jar... 位置在lucene解压后的queryparser文件夹下

        QueryParser queryParser=new MultiFieldQueryParser(LuceneUtils.getVersion(),fields,LuceneUtils.getAnalyzer());

//        /这个事一个条件..

        Query query=queryParser.parse(keywords);

        //query 它是一个查询条件，query 是一个抽象类，不同的查询规则构造部同的子类即可

        //检索符合query 条件的前面N 条记录...

        //检索的是索引目录... (总记录数，socreDOC (docID))

        //使用lucene 提供的api 进行操作...

        TopDocs topDocs=indexSearcher.search(query,firstResult+maxResult);

//        /存放的是docID

        ScoreDoc scoreDocs []=topDocs.scoreDocs;

        //判断:scoreDocs 的length  (实际取出来的数量..) 与 firstResult+maxResult 的值取小值...

        //在java jdk 里面提供了一个api

        int endResult=Math.min(scoreDocs.length, firstResult+maxResult);

        for(int i=firstResult;i<endResult;i++){

//            /取出来的是docID,这个id 是lucene 自己来维护。

            int docID=scoreDocs[i].doc;

            Document document=indexSearcher.doc(docID);

            System.out.println("id==="+document.get("id"));

            System.out.println("title==="+document.get("title"));

            System.out.println("content==="+document.get("content"));

            System.out.println("url==="+document.get("url"));

            System.out.println("author==="+document.get("author"));

        }

    }

}

测试类：

package cn.itcast.junit;

import java.io.IOException;

import org.junit.Test;

import cn.itcast.bean.Article;

import cn.itcast.dao.LuceneDao;

/**

 * 测试luceneDao

 * @author Administrator

 *

 */

public class LuceneDaoTest {

    private LuceneDao luceneDao=new LuceneDao();

    @Test

    public void testCreate() throws IOException{

        for(int i=28;i<=28;i++){

            Article article=new Article();

            article.setId(i);

            article.setTitle("一定要梦想，万一实现了勒");

            article.setContent("矫情我觉得这句话太矫情了矫情矫情矫情矫情矫情矫情");

            article.setUrl("http://www.tianmao.com");

            article.setAuthor("马云");

            luceneDao.addIndex(article);

        }

    }

    @Test

    public void testsearcher() throws Exception{

//        article.setTitle("一定要梦想，万一实现了勒");   textfield   分词     标准分词器

//        article.setContent("我觉得这句话太矫情了");   textfield   分词    标准分词器

        luceneDao.findIndex("梦想",20,10);

    }

    @Test

    public void testdelete() throws IOException{

        String fieldName="title";

        String fieldValue="定";

        luceneDao.delIndex(fieldName, fieldValue);

    }

    @Test

    public void testUpdate() throws IOException{

        String fieldName="title";

        String fieldValue="定";

        Article article=new Article();

        article.setId(9527);

        article.setTitle("一定要梦想，万一实现了勒");

        article.setContent("我觉得这句话太矫情了");

        article.setUrl("http://www.tianmao.com");

        article.setAuthor("马云");

        luceneDao.updateIndex(fieldName, fieldValue, article);

    }

}

分词器的流程图：

关于分词器，网上可以找到很多种类的分词器配合Lucene使用，相关分词规则查看对应说明。

举例如下：

Analyzer analyzer=new StandardAnalyzer(Version.LUCENE_44);//中文单字切分、英文按空格切分成单词

Analyzer analyzer=new CJKAnalyzer(Version.LUCENE_44);//二分法分词，中文相连的两个词作为一个索引

Analyzer analyzer=new IKAnalyzer();//第三方的分词器，对中文支持较好，可以自定义分词单词与停用词

索引库优化

package cn.itcast.lucene;

import java.io.File;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.index.DirectoryReader;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.IndexWriterConfig;

import org.apache.lucene.index.LogDocMergePolicy;

import org.apache.lucene.index.MergePolicy;

import org.apache.lucene.index.Term;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.TermQuery;

import org.apache.lucene.search.TopDocs;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.store.IOContext;

import org.apache.lucene.store.RAMDirectory;

import org.apache.lucene.util.Version;

import org.junit.Test;

import cn.itcast.uitls.Constants;

public class TestOptimise {

    /*8

     * 优化的第一种方式:通过 IndexWriterConfig 优化设置mergePolicy（合并策略）

     *

     *

     */

    public void testoptimise() throws IOException{

        Directory directory=FSDirectory.open(new File(Constants.URL));

        Analyzer analyzer=new StandardAnalyzer(Version.LUCENE_44);

        IndexWriterConfig config=new IndexWriterConfig(Version.LUCENE_44, analyzer);

        LogDocMergePolicy mergePolicy=new LogDocMergePolicy();

        /**

         * 当这个值越小，更少的内存会被运用当创建索引的时候，搜索的时候越快，创建的时候越慢。

         * 当这个值越大，更多的内存会被运用当创建索引的时候，搜索的时候越慢，创建的时候越快..

         * larger values >10

         *

         * 2<=smaller<=10

         *

         */

        //设置合并因子..

        mergePolicy.setMergeFactor(10);

//        /设置索引的合并策略..

        config.setMergePolicy(mergePolicy);

        IndexWriter indexWriter=new IndexWriter(directory, config);

    }

    /**

     * 通过directory 去优化....

     * @throws IOException

     *

     */

    @Test

    public void testoptimise2() throws IOException{

        //现在的索引放在硬盘上面...

        Directory directory=FSDirectory.open(new File(Constants.URL));

//        /通过这个对象吧directory 里面的数据读取到directory1 里面来..

        IOContext ioContext=new IOContext();

        //相办法吧directory 的索引读取到内存当中来...

        Directory directory1=new RAMDirectory(directory,ioContext);

        IndexReader indexReader=DirectoryReader.open(directory1);

        IndexSearcher indexSearcher=new IndexSearcher(indexReader);

        Query query=new TermQuery(new Term("title", "想"));

        TopDocs topDocs=indexSearcher.search(query, 100);

        System.out.println(topDocs.totalHits);

    }

    /**

     * 索引文件越大，会影响检索的速度..  (减少索引文件的大小)

     *

     * 1:排除停用词..

     *

     */

    public void testoptimise3(){

    }

    /**

     * 将索引分目盘存放  将数据归类...

     *

     */

    public void testoptimise4(){

    }

}

Lucene笔记一的更多相关文章

lucene笔记
lucene全文检索全文检索是计算机程序通过扫描文章中的每一个词, 对每一个词建立一个索引, 指明该词在文章中出现的次数和位置. 当用户查询时根据建立的索引查找,类似于通过字典的检索字表查字的过程
Lucene笔记二
lucene 的排序 package cn.itcast.lucene; import java.io.IOException; import org.apache.lucene.document.D ...
lucene教程【转】【补】
现实流程 lucene 相关jar包第一个:Lucene-core-4.0.0.jar, 其中包括了常用的文档,索引,搜索,存储等相关核心代码. 第二个:Lucene-analyzers-commo ...
Lucene/Solr搜索引擎开发笔记 - 第1章 Solr安装与部署（Jetty篇）
一.为何开博客写<Lucene/Solr搜索引擎开发笔记> 本人毕业于2011年,2011-2014的三年时间里,在深圳前50强企业工作,从事工业控制领域的机器视觉方向,主要使用语言为C/ ...
Lucene学习笔记（更新）
1.Lucene学习笔记 http://www.cnblogs.com/hanganglin/articles/3453415.html
lucene 5.2.0学习笔记
package com.bc.cas.manager; import com.bc.cas.dao.BookDao; import com.bc.cas.model.entity.Book; impo ...
Apache Lucene学习笔记
Hadoop概述 Apache lucene: 全球第一个开源的全文检索引擎工具包完整的查询引擎和搜索引擎部分文本分析引擎开发人员在此基础建立完整的全文检索引擎以下为转载:http://www ...
[lucene系列笔记1]lucene6的安装与配置（Windows系统）
lucene是一个java开源的高效全文检索工具包,最近做项目要用到,把学习的过程记录一下. 第一步:下载安装jdk 1.首先从官网下载jdk(下载之前先查看你的电脑是多少位操作系统,如果是32就下载 ...
Lucene学习笔记
师兄推荐我学习Lucene这门技术,用了两天时间,大概整理了一下相关知识点. 一.什么是Lucene Lucene即全文检索.全文检索是计算机程序通过扫描文章中的每一个词,对每一个词建立一个索引,指明 ...

随机推荐

Java设计模式（18）——行为模式之迭代子模式（Iterator）
一.概述概念 UML简图 // Aggregate:聚集(集合) 角色抽象迭代子:定义遍历元素所需要的接口具体迭代子:实现抽象迭代子接口,保持游标聚集/具体聚集:定义/实现创建迭代子对象的接口 ...
杭州优步uber司机第二组奖励政策
-8月9日更新- 优步杭州第二组: 定义为激活时间在2015/6/8之后2015/8/3之前的车主(以优步后台数据显示为准) 滴滴快车单单2.5倍,注册地址:http://www.udache.com ...
厦门Uber优步司机奖励政策（12月14日到12月20日）
滴快车单单2.5倍,注册地址:http://www.udache.com/ 如何注册Uber司机(全国版最新最详细注册流程)/月入2万/不用抢单:http://www.cnblogs.com/mfry ...
mysql数据库目录存放位置更改
http://haowen.blog.51cto.com/3486731/1274721 mysql数据库存储路径更改使用了VPS一段时间之后发现磁盘空间快满了.本人的VPS在购买的时候买了500g ...
Net Core学习笔记
Net Core 官网:https://dotnet.github.io/ Net Core Api: https://docs.microsoft.com/en-us/dotnet/api/?vie ...
「日常训练&知识学习」莫队算法（二）：树上莫队（Count on a tree II，SPOJ COT2）
题意与分析题意是这样的,给定一颗节点有权值的树,然后给若干个询问,每次询问让你找出一条链上有多少个不同权值. 写这题之前要参看我的三个blog:Codeforces Round #326 Div. ...
了解Python控制流语句——if语句
控制流截止到现在,在我们所看过的程序中,总是有一系列语句从上到下精确排列,并交由 Python 忠实地执行.如果你想改变这一工作流程,应该怎么做?就像这样的情况:你需要程序作出一些决定,并依据不同的 ...
TW实习日记：第22天
今天开发项目的还没完成的功能点,没什么难的,样式复制粘贴,JSON表单配一配,接口调一调,基本就完成了.不过中间在写后台的一些接口时,发现被自己之前写的一些方法给坑了.为什么这样说呢,因为在之前的几个 ...
js for循环实例
1.求1-100的寄数和? //2.奇数求和 var ppt=0 for(var i=1;i<=100;i+=2){ ppt+=i } 2.求1-100的偶数和 var num=0 for(va ...
UVa 1585 - Score - ACM/ICPC Seoul 2005 解题报告 - C语言
1.题目大意给出一个由O和X组成的字符串(长度为80以内),每个O的得分为目前连续出现的O的数量,X得分为0,统计得分. 2.思路实在说不出了,这题没过脑AC的.直接贴代码吧.=_= 3.代码 # ...

Lucene笔记一

Lucene笔记一的更多相关文章

随机推荐

热门专题