References:

https://blog.csdn.net/u014209975/article/details/50525624

https://www.cnblogs.com/hanyinglong/p/5395600.html

http://lucene.apache.org/core/4_0_0/core/overview-summary.html

https://www.jianshu.com/p/0a2bbe0f4c42

Dependencies:

lucene-analyzers.jar
lucene-benchmark.jar
lucene-core.jar
lucene-highlighter.jar
lucene-memory.jar
lucene-parser.jar
lucene-remote.jar
lucene-smartcn.jar

Entity class:

package com.h3c.lucence;

import java.io.Serializable;

public class Entity implements Serializable {

    private static final long serialVersionUID = 3701082756628915138L;

    private Integer id;

    private String type;

    private String virtualDoc;

    private String summary;

    private float score;

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getVirtualDoc() {
        if (null == virtualDoc) {
            // TODO build a virtual document from the entity's values, covering every
            // attribute and its value, for use in full-text search.
            // Format: field1:value1,field2:value2,...
        }
        return virtualDoc;
    }

    public void setVirtualDoc(String virtualDoc) {
        this.virtualDoc = virtualDoc;
    }

    public String getSummary() {
        if (null == summary) {
            return summary;
        }
        StringBuilder sb = new StringBuilder();
        // Strip the highlight markup so the fragment can be located in the full text.
        String tmpSum = summary;
        tmpSum = tmpSum.replace("<SPAN style=\"color:red;\">", "");
        tmpSum = tmpSum.replace("</SPAN>", "");
        String virtualDoc2 = getVirtualDoc();
        if (null == virtualDoc2) {
            return summary;
        }
        int length = tmpSum.length();
        int firstIndex = virtualDoc2.indexOf(tmpSum);
        // Add ellipses when the fragment is cut out of the middle of the document.
        if (firstIndex > 0) {
            sb.append("...");
        }
        sb.append(summary);
        if (firstIndex + length < virtualDoc2.length()) {
            sb.append("...");
        }
        return sb.toString();
    }

    public void setSummary(String summary) {
        this.summary = summary;
    }

    public float getScore() {
        return score;
    }

    public void setScore(float score) {
        this.score = score;
    }
}
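The getVirtualDoc() TODO is left for the reader to fill in from the real business object. As one possible sketch (the extra field names are hypothetical, not part of the original), the method could concatenate each attribute and its value in the "field1:value1,field2:value2" format the comment describes:

    // Hypothetical sketch of the getVirtualDoc() TODO; only id and type exist on
    // the Entity above, any further fields are placeholders for your own object.
    public String getVirtualDoc() {
        if (null == virtualDoc) {
            StringBuilder sb = new StringBuilder();
            sb.append("id:").append(id);
            sb.append(",type:").append(type);
            // append the remaining business attributes here, e.g.
            // sb.append(",name:").append(name);
            virtualDoc = sb.toString();
        }
        return virtualDoc;
    }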

Demo class:

package com.h3c.lucence;

import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class Demo {
    /** Lucene index directory. */
    private static Directory ciIndexDir;

    private static final String CI_CONTENT_FLAG = "virtualDoc";

    /** Analyzer: the StandardAnalyzer indexes single CJK characters and consecutive English words as terms. */
    private static final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);

    private static Pattern VALID_IPV4_PATTERN = null;
    private static Pattern VALID_IPV6_PATTERN = null;
    private static final String ipv4Pattern = "(([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.){3}([01]?\\d\\d?|2[0-4]\\d|25[0-5])";
    private static final String ipv6Pattern = "([0-9a-f]{1,4}:){7}([0-9a-f]){1,4}";

    private static IndexWriter indexWriter;

    static {
        VALID_IPV4_PATTERN = Pattern.compile(ipv4Pattern, Pattern.CASE_INSENSITIVE);
        VALID_IPV6_PATTERN = Pattern.compile(ipv6Pattern, Pattern.CASE_INSENSITIVE);
        IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
        try {
            indexWriter = new IndexWriter(getCiIndexDir(), conf);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    private static Directory getCiIndexDir() {
        if (null == ciIndexDir) {
            try {
                ciIndexDir = FSDirectory.open(new File("D://indexs"));
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return ciIndexDir;
    }

    private static boolean isIpAddress(String ipAddress) {
        Matcher m1 = VALID_IPV4_PATTERN.matcher(ipAddress);
        Matcher m2 = VALID_IPV6_PATTERN.matcher(ipAddress);
        return m1.matches() || m2.matches();
    }

    private static boolean isChinese(char c) {
        Character.UnicodeBlock ub = Character.UnicodeBlock.of(c);
        if (ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
                || ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS
                || ub == Character.UnicodeBlock.GENERAL_PUNCTUATION) {
            return true;
        }
        return false;
    }

    private static BooleanQuery parseChineseCharacters(String inputString) {
        BooleanQuery query = new BooleanQuery();
        // An IP address is indexed as a single term, so match it exactly.
        if (isIpAddress(inputString)) {
            query.add(new TermQuery(new Term(CI_CONTENT_FLAG, inputString)), BooleanClause.Occur.MUST);
            return query;
        }
        BooleanQuery fieldQuery = new BooleanQuery();
        boolean isWord = false;
        StringBuilder tempWord = new StringBuilder();
        inputString = inputString.toLowerCase();
        BooleanQuery booleanQuery = new BooleanQuery();
        int length = inputString.length();
        Query termQuery = null;
        for (int i = 0; i < length; i++) {
            char c = inputString.charAt(i);
            if (c >= 'a' && c <= 'z' || c >= '0' && c <= '9') { // English letter or digit
                isWord = true;
                tempWord.append(c);
            } else { // delimiter or Chinese character
                isWord = false;
                if (tempWord.length() > 0) {
                    termQuery = new PrefixQuery(new Term(CI_CONTENT_FLAG, tempWord.toString()));
                    // booleanQuery.add(termQuery, BooleanClause.Occur.MUST);
                    booleanQuery.add(termQuery, BooleanClause.Occur.SHOULD);
                    tempWord = new StringBuilder();
                }
            }
            if (!isWord) {
                termQuery = new TermQuery(new Term(CI_CONTENT_FLAG, String.valueOf(c)));
                if (isChinese(c)) { // Chinese character
                    // booleanQuery.add(termQuery, BooleanClause.Occur.MUST);
                    booleanQuery.add(termQuery, BooleanClause.Occur.SHOULD);
                } else { // delimiter
                    booleanQuery.add(termQuery, BooleanClause.Occur.SHOULD);
                }
            }
        }
        if (tempWord.length() > 0) {
            termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG, tempWord.toString() + "*"));
            booleanQuery.add(termQuery, BooleanClause.Occur.SHOULD);
            termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG, "*" + tempWord.toString()));
            booleanQuery.add(termQuery, BooleanClause.Occur.SHOULD);
        }
        // Begin: match the whole input string against the field with wildcards
        termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG, inputString + "*"));
        booleanQuery.add(termQuery, BooleanClause.Occur.SHOULD);
        termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG, "*" + inputString));
        booleanQuery.add(termQuery, BooleanClause.Occur.SHOULD);
        termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG, "*" + inputString + "*"));
        booleanQuery.add(termQuery, BooleanClause.Occur.SHOULD);
        // End: whole-string wildcard matching
        BooleanClause clause = new BooleanClause(booleanQuery, BooleanClause.Occur.MUST);
        fieldQuery.add(clause);
        BooleanClause fieldClause = new BooleanClause(fieldQuery, BooleanClause.Occur.MUST);
        query.add(fieldClause);
        return query;
    }

    /**
     * Full-text search.
     * @param queryStr
     * @param highlight whether to build highlighted summaries
     * @throws Exception
     */
    private static void contentSearch(String queryStr, boolean highlight) throws Exception {
        IndexReader indexReader = null;
        IndexSearcher indexSearcher = null;
        try {
            indexReader = IndexReader.open(getCiIndexDir());
            indexSearcher = new IndexSearcher(indexReader);
            // Build the combined query; adapt this to your own business rules.
            Query query = parseChineseCharacters(queryStr);
            TopDocs hits = indexSearcher.search(query, Integer.MAX_VALUE);
            if (hits.totalHits > 0) {
                if (highlight) {
                    QueryScorer scorer = new QueryScorer(query, CI_CONTENT_FLAG);
                    SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<SPAN style=\"color:red;\">", "</SPAN>");
                    Highlighter highlighter = new Highlighter(formatter, scorer);
                    highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 100));
                    for (ScoreDoc scoreDoc : hits.scoreDocs) {
                        Document doc = indexSearcher.doc(scoreDoc.doc);
                        System.out.println(doc.get("virtualDoc"));
                        Entity entity = convertToEntity(doc, indexSearcher.getIndexReader(), scoreDoc.doc, highlighter);
                        entity.setScore(scoreDoc.score);
                    }
                } else {
                    for (ScoreDoc scoreDoc : hits.scoreDocs) {
                        Document doc = indexSearcher.doc(scoreDoc.doc);
                        System.out.println(doc.get("virtualDoc"));
                        Entity entity = convertToEntity(doc);
                        entity.setScore(scoreDoc.score);
                    }
                }
            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
        } finally {
            close(indexSearcher);
            close(indexReader);
        }
    }

    /**
     * Close anything that implements Closeable, ignoring close failures.
     * @param object
     */
    private static void close(Closeable object) {
        if (null != object) {
            try {
                object.close();
            } catch (IOException e) {
            }
        }
    }

    /**
     * Convert an entity to a Lucene Document.
     * @param entity
     * @return
     */
    public static Document convertToDocument(Entity entity) {
        Document doc = new Document();
        String virtualDoc = entity.getVirtualDoc();
        // Field.Store.YES stores the raw value; Field.Index.ANALYZED tokenizes it for searching.
        doc.add(new Field("id", String.valueOf(entity.getId()), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("type", entity.getType(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(CI_CONTENT_FLAG, null == virtualDoc ? " " : virtualDoc, Field.Store.YES, Field.Index.ANALYZED));
        return doc;
    }

    /**
     * Convert a Lucene Document back to an entity.
     * @param doc
     * @return
     */
    public static Entity convertToEntity(Document doc) {
        Entity ci = new Entity();
        ci.setId(Integer.valueOf(doc.get("id")));
        ci.setType(doc.get("type"));
        ci.setVirtualDoc(doc.get(CI_CONTENT_FLAG));
        return ci;
    }

    /**
     * Convert a search hit to an entity, including a highlighted summary.
     * @param doc
     * @param indexReader
     * @param docId
     * @param highlighter
     * @return
     * @throws IOException
     * @throws InvalidTokenOffsetsException
     */
    public static Entity convertToEntity(Document doc, IndexReader indexReader, int docId, Highlighter highlighter)
            throws IOException, InvalidTokenOffsetsException {
        Entity entity = convertToEntity(doc);
        String virtualDoc = entity.getVirtualDoc();
        TokenStream stream = TokenSources.getAnyTokenStream(indexReader, docId, CI_CONTENT_FLAG, doc, analyzer);
        String highlighterSummary = highlighter.getBestFragment(stream, virtualDoc);
        if (highlighterSummary == null) {
            highlighterSummary = virtualDoc;
        }
        entity.setSummary(highlighterSummary);
        return entity;
    }

    /**
     * Add an index entry for one entity (replacing any existing entry with the same id).
     * @param entity
     */
    public static void addIndex(Entity entity) {
        try {
            deleteIndex(entity);
            Document doc = convertToDocument(entity);
            indexWriter.addDocument(doc);
            indexWriter.commit();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Add index entries in batch.
     * @param list
     */
    public static void addIndexs(List<Entity> list) {
        try {
            List<Document> docs = new ArrayList<Document>();
            deleteIndexs(list);
            for (Entity entity : list) {
                Document doc = convertToDocument(entity);
                docs.add(doc);
            }
            indexWriter.addDocuments(docs);
            indexWriter.commit();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Update the index entry for an entity.
     * @param entity
     */
    public static void updateIndex(Entity entity) {
        try {
            addIndex(entity);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Delete the index entries for a list of entities.
     * @param list
     */
    public static void deleteIndexs(List<Entity> list) {
        try {
            int size = list.size();
            Term[] terms = new Term[size];
            for (int i = 0; i < size; i++) {
                terms[i] = new Term("id", list.get(i).getId().toString());
            }
            indexWriter.deleteDocuments(terms);
            indexWriter.commit();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Delete the index entry for an entity.
     * @param entity
     */
    public static void deleteIndex(Entity entity) {
        try {
            indexWriter.deleteDocuments(new Term("id", entity.getId().toString()));
            indexWriter.commit();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Delete all index entries of the given entity type.
     * @param type
     */
    public static void deleteIndexByType(String type) {
        try {
            indexWriter.deleteDocuments(new Term("type", type));
            indexWriter.commit();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    @Override
    protected void finalize() throws Throwable {
        indexWriter.close();
    }

    public static void main(String[] args) throws Exception {
        String queryStr = "http://mail6c1.shenzhenair.com";
        contentSearch(queryStr, true);
    }
}
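The main() above only searches an existing index. A minimal sketch of a variant that first indexes some data and then searches it is shown below; the two entities and their virtualDoc values are made up for illustration, and it assumes the same index directory configured in getCiIndexDir():

    // Sketch only: index two hypothetical entities, then run a highlighted search.
    public static void main(String[] args) throws Exception {
        Entity server = new Entity();
        server.setId(1);
        server.setType("server");
        server.setVirtualDoc("name:mail server,ip:192.168.1.10");

        Entity router = new Entity();
        router.setId(2);
        router.setType("router");
        router.setVirtualDoc("name:core router,ip:192.168.1.1");

        // Batch-index both entities (existing entries with the same ids are deleted first).
        addIndexs(java.util.Arrays.asList(server, router));

        // Run a highlighted full-text search for the word "mail".
        contentSearch("mail", true);
    }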
