References:

https://blog.csdn.net/u014209975/article/details/50525624

https://www.cnblogs.com/hanyinglong/p/5395600.html

http://lucene.apache.org/core/4_0_0/core/overview-summary.html

https://www.jianshu.com/p/0a2bbe0f4c42

Dependencies:

lucene-analyzers.jar
lucene-benchmark.jar
lucene-core.jar
lucene-highlighter.jar
lucene-memory.jar
lucene-parser.jar
lucene-remote.jar
lucene-smartcn.jar

Entity class:

package com.h3c.lucence;

import java.io.Serializable;

public class Entity implements Serializable {

    private static final long serialVersionUID = 3701082756628915138L;

    private Integer id;

    private String type;

    private String virtualDoc;

    private String summary;

    private float score;

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getVirtualDoc() {
        if (null == virtualDoc) {
            // TODO build a virtual document from the entity's values, covering every
            // attribute and its value, for use in full-text search.
            // Format: field1:value1,field2:value2,...
        }
        return virtualDoc;
    }

    public void setVirtualDoc(String virtualDoc) {
        this.virtualDoc = virtualDoc;
    }

    public String getSummary() {
        if (null == summary) {
            return summary;
        }
        StringBuilder sb = new StringBuilder();
        // Strip the highlight markup so the fragment can be located in the full text.
        String tmpSum = summary;
        tmpSum = tmpSum.replace("<SPAN style=\"color:red;\">", "");
        tmpSum = tmpSum.replace("</SPAN>", "");
        String virtualDoc2 = getVirtualDoc();
        if (null == virtualDoc2) {
            return summary;
        }
        int length = tmpSum.length();
        int firstIndex = virtualDoc2.indexOf(tmpSum);
        // Add ellipses when the fragment is cut out of the middle of the document.
        if (firstIndex > 0) {
            sb.append("...");
        }
        sb.append(summary);
        if (firstIndex + length < virtualDoc2.length()) {
            sb.append("...");
        }
        return sb.toString();
    }

    public void setSummary(String summary) {
        this.summary = summary;
    }

    public float getScore() {
        return score;
    }

    public void setScore(float score) {
        this.score = score;
    }
}
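The getVirtualDoc() TODO is left for the reader to fill in from the real business object. As one possible sketch (the extra field names are hypothetical, not part of the original), the method could concatenate each attribute and its value in the "field1:value1,field2:value2" format the comment describes:

    // Hypothetical sketch of the getVirtualDoc() TODO; only id and type exist on
    // the Entity above, any further fields are placeholders for your own object.
    public String getVirtualDoc() {
        if (null == virtualDoc) {
            StringBuilder sb = new StringBuilder();
            sb.append("id:").append(id);
            sb.append(",type:").append(type);
            // append the remaining business attributes here, e.g.
            // sb.append(",name:").append(name);
            virtualDoc = sb.toString();
        }
        return virtualDoc;
    }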

Demo class:

package com.h3c.lucence;

import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class Demo {
    /** Lucene index directory. */
    private static Directory ciIndexDir;

    private static final String CI_CONTENT_FLAG = "virtualDoc";

    /** Analyzer: the StandardAnalyzer indexes single CJK characters and consecutive English words as terms. */
    private static final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);

    private static Pattern VALID_IPV4_PATTERN = null;
    private static Pattern VALID_IPV6_PATTERN = null;
    private static final String ipv4Pattern = "(([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.){3}([01]?\\d\\d?|2[0-4]\\d|25[0-5])";
    private static final String ipv6Pattern = "([0-9a-f]{1,4}:){7}([0-9a-f]){1,4}";

    private static IndexWriter indexWriter;

    static {
        VALID_IPV4_PATTERN = Pattern.compile(ipv4Pattern, Pattern.CASE_INSENSITIVE);
        VALID_IPV6_PATTERN = Pattern.compile(ipv6Pattern, Pattern.CASE_INSENSITIVE);
        IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
        try {
            indexWriter = new IndexWriter(getCiIndexDir(), conf);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    private static Directory getCiIndexDir() {
        if (null == ciIndexDir) {
            try {
                ciIndexDir = FSDirectory.open(new File("D://indexs"));
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return ciIndexDir;
    }

    private static boolean isIpAddress(String ipAddress) {
        Matcher m1 = VALID_IPV4_PATTERN.matcher(ipAddress);
        Matcher m2 = VALID_IPV6_PATTERN.matcher(ipAddress);
        return m1.matches() || m2.matches();
    }

    private static boolean isChinese(char c) {
        Character.UnicodeBlock ub = Character.UnicodeBlock.of(c);
        if (ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
                || ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS
                || ub == Character.UnicodeBlock.GENERAL_PUNCTUATION) {
            return true;
        }
        return false;
    }

    private static BooleanQuery parseChineseCharacters(String inputString) {
        BooleanQuery query = new BooleanQuery();
        // An IP address is indexed as a single term, so match it exactly.
        if (isIpAddress(inputString)) {
            query.add(new TermQuery(new Term(CI_CONTENT_FLAG, inputString)), BooleanClause.Occur.MUST);
            return query;
        }
        BooleanQuery fieldQuery = new BooleanQuery();
        boolean isWord = false;
        StringBuilder tempWord = new StringBuilder();
        inputString = inputString.toLowerCase();
        BooleanQuery booleanQuery = new BooleanQuery();
        int length = inputString.length();
        Query termQuery = null;
        for (int i = 0; i < length; i++) {
            char c = inputString.charAt(i);
            if (c >= 'a' && c <= 'z' || c >= '0' && c <= '9') { // English letter or digit
                isWord = true;
                tempWord.append(c);
            } else { // delimiter or Chinese character
                isWord = false;
                if (tempWord.length() > 0) {
                    termQuery = new PrefixQuery(new Term(CI_CONTENT_FLAG, tempWord.toString()));
                    // booleanQuery.add(termQuery, BooleanClause.Occur.MUST);
                    booleanQuery.add(termQuery, BooleanClause.Occur.SHOULD);
                    tempWord = new StringBuilder();
                }
            }
            if (!isWord) {
                termQuery = new TermQuery(new Term(CI_CONTENT_FLAG, String.valueOf(c)));
                if (isChinese(c)) { // Chinese character
                    // booleanQuery.add(termQuery, BooleanClause.Occur.MUST);
                    booleanQuery.add(termQuery, BooleanClause.Occur.SHOULD);
                } else { // delimiter
                    booleanQuery.add(termQuery, BooleanClause.Occur.SHOULD);
                }
            }
        }
        if (tempWord.length() > 0) {
            termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG, tempWord.toString() + "*"));
            booleanQuery.add(termQuery, BooleanClause.Occur.SHOULD);
            termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG, "*" + tempWord.toString()));
            booleanQuery.add(termQuery, BooleanClause.Occur.SHOULD);
        }
        // Begin: match the whole input string against the field with wildcards
        termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG, inputString + "*"));
        booleanQuery.add(termQuery, BooleanClause.Occur.SHOULD);
        termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG, "*" + inputString));
        booleanQuery.add(termQuery, BooleanClause.Occur.SHOULD);
        termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG, "*" + inputString + "*"));
        booleanQuery.add(termQuery, BooleanClause.Occur.SHOULD);
        // End: whole-string wildcard matching
        BooleanClause clause = new BooleanClause(booleanQuery, BooleanClause.Occur.MUST);
        fieldQuery.add(clause);
        BooleanClause fieldClause = new BooleanClause(fieldQuery, BooleanClause.Occur.MUST);
        query.add(fieldClause);
        return query;
    }

    /**
     * Full-text search.
     * @param queryStr
     * @param highlight whether to build highlighted summaries
     * @throws Exception
     */
    private static void contentSearch(String queryStr, boolean highlight) throws Exception {
        IndexReader indexReader = null;
        IndexSearcher indexSearcher = null;
        try {
            indexReader = IndexReader.open(getCiIndexDir());
            indexSearcher = new IndexSearcher(indexReader);
            // Build the combined query; adapt this to your own business rules.
            Query query = parseChineseCharacters(queryStr);
            TopDocs hits = indexSearcher.search(query, Integer.MAX_VALUE);
            if (hits.totalHits > 0) {
                if (highlight) {
                    QueryScorer scorer = new QueryScorer(query, CI_CONTENT_FLAG);
                    SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<SPAN style=\"color:red;\">", "</SPAN>");
                    Highlighter highlighter = new Highlighter(formatter, scorer);
                    highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 100));
                    for (ScoreDoc scoreDoc : hits.scoreDocs) {
                        Document doc = indexSearcher.doc(scoreDoc.doc);
                        System.out.println(doc.get("virtualDoc"));
                        Entity entity = convertToEntity(doc, indexSearcher.getIndexReader(), scoreDoc.doc, highlighter);
                        entity.setScore(scoreDoc.score);
                    }
                } else {
                    for (ScoreDoc scoreDoc : hits.scoreDocs) {
                        Document doc = indexSearcher.doc(scoreDoc.doc);
                        System.out.println(doc.get("virtualDoc"));
                        Entity entity = convertToEntity(doc);
                        entity.setScore(scoreDoc.score);
                    }
                }
            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
        } finally {
            close(indexSearcher);
            close(indexReader);
        }
    }

    /**
     * Close anything that implements Closeable, ignoring close failures.
     * @param object
     */
    private static void close(Closeable object) {
        if (null != object) {
            try {
                object.close();
            } catch (IOException e) {
            }
        }
    }

    /**
     * Convert an entity to a Lucene Document.
     * @param entity
     * @return
     */
    public static Document convertToDocument(Entity entity) {
        Document doc = new Document();
        String virtualDoc = entity.getVirtualDoc();
        // Field.Store.YES stores the raw value; Field.Index.ANALYZED tokenizes it for searching.
        doc.add(new Field("id", String.valueOf(entity.getId()), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("type", entity.getType(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(CI_CONTENT_FLAG, null == virtualDoc ? " " : virtualDoc, Field.Store.YES, Field.Index.ANALYZED));
        return doc;
    }

    /**
     * Convert a Lucene Document back to an entity.
     * @param doc
     * @return
     */
    public static Entity convertToEntity(Document doc) {
        Entity ci = new Entity();
        ci.setId(Integer.valueOf(doc.get("id")));
        ci.setType(doc.get("type"));
        ci.setVirtualDoc(doc.get(CI_CONTENT_FLAG));
        return ci;
    }

    /**
     * Convert a search hit to an entity, including a highlighted summary.
     * @param doc
     * @param indexReader
     * @param docId
     * @param highlighter
     * @return
     * @throws IOException
     * @throws InvalidTokenOffsetsException
     */
    public static Entity convertToEntity(Document doc, IndexReader indexReader, int docId, Highlighter highlighter)
            throws IOException, InvalidTokenOffsetsException {
        Entity entity = convertToEntity(doc);
        String virtualDoc = entity.getVirtualDoc();
        TokenStream stream = TokenSources.getAnyTokenStream(indexReader, docId, CI_CONTENT_FLAG, doc, analyzer);
        String highlighterSummary = highlighter.getBestFragment(stream, virtualDoc);
        if (highlighterSummary == null) {
            highlighterSummary = virtualDoc;
        }
        entity.setSummary(highlighterSummary);
        return entity;
    }

    /**
     * Add an index entry for one entity (replacing any existing entry with the same id).
     * @param entity
     */
    public static void addIndex(Entity entity) {
        try {
            deleteIndex(entity);
            Document doc = convertToDocument(entity);
            indexWriter.addDocument(doc);
            indexWriter.commit();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Add index entries in batch.
     * @param list
     */
    public static void addIndexs(List<Entity> list) {
        try {
            List<Document> docs = new ArrayList<Document>();
            deleteIndexs(list);
            for (Entity entity : list) {
                Document doc = convertToDocument(entity);
                docs.add(doc);
            }
            indexWriter.addDocuments(docs);
            indexWriter.commit();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Update the index entry for an entity.
     * @param entity
     */
    public static void updateIndex(Entity entity) {
        try {
            addIndex(entity);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Delete the index entries for a list of entities.
     * @param list
     */
    public static void deleteIndexs(List<Entity> list) {
        try {
            int size = list.size();
            Term[] terms = new Term[size];
            for (int i = 0; i < size; i++) {
                terms[i] = new Term("id", list.get(i).getId().toString());
            }
            indexWriter.deleteDocuments(terms);
            indexWriter.commit();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Delete the index entry for an entity.
     * @param entity
     */
    public static void deleteIndex(Entity entity) {
        try {
            indexWriter.deleteDocuments(new Term("id", entity.getId().toString()));
            indexWriter.commit();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Delete all index entries of the given entity type.
     * @param type
     */
    public static void deleteIndexByType(String type) {
        try {
            indexWriter.deleteDocuments(new Term("type", type));
            indexWriter.commit();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    @Override
    protected void finalize() throws Throwable {
        indexWriter.close();
    }

    public static void main(String[] args) throws Exception {
        String queryStr = "http://mail6c1.shenzhenair.com";
        contentSearch(queryStr, true);
    }
}
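The main() above only searches an existing index. A minimal sketch of a variant that first indexes some data and then searches it is shown below; the two entities and their virtualDoc values are made up for illustration, and it assumes the same index directory configured in getCiIndexDir():

    // Sketch only: index two hypothetical entities, then run a highlighted search.
    public static void main(String[] args) throws Exception {
        Entity server = new Entity();
        server.setId(1);
        server.setType("server");
        server.setVirtualDoc("name:mail server,ip:192.168.1.10");

        Entity router = new Entity();
        router.setId(2);
        router.setType("router");
        router.setVirtualDoc("name:core router,ip:192.168.1.1");

        // Batch-index both entities (existing entries with the same ids are deleted first).
        addIndexs(java.util.Arrays.asList(server, router));

        // Run a highlighted full-text search for the word "mail".
        contentSearch("mail", true);
    }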
