Lucene全文检索引擎

<!-- Maven build descriptor for the Lucene01 demo: declares the three Lucene artifacts used by Indexer and Searcher. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">

<modelVersion>4.0.0</modelVersion>

  <groupId>demo.lucene</groupId>
  <artifactId>Lucene01</artifactId>
  <version>0.0.1-SNAPSHOT</version>

  <properties>
    <!-- Single source of truth for the Lucene release; all Lucene artifacts below must share it. -->
    <lucene.version>5.3.1</lucene.version>
  </properties>

  <build/>

  <dependencies>

    <!-- Lucene core library -->
    <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-core</artifactId>
        <version>${lucene.version}</version>
    </dependency>

    <!-- Lucene query parser library -->
    <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-queryparser</artifactId>
        <version>${lucene.version}</version>
    </dependency>

    <!-- Lucene common analyzers library -->
    <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-analyzers-common</artifactId>
        <version>${lucene.version}</version>
    </dependency>

  </dependencies>

</project>

import java.io.File;

import java.io.FileReader;

import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.document.TextField;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.IndexWriterConfig;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

/**
 * Builds a Lucene index from the files found directly inside a data directory.
 *
 * <p>Each regular file becomes one document with three fields: {@code contents}
 * (tokenized from the file's text), {@code fileName} and {@code fullPath}
 * (both stored in the index so they can be read back from search hits).
 *
 * <p>Instances hold an open {@link IndexWriter} and its {@link Directory}; call
 * {@link #close()} (or use try-with-resources) when done.
 *
 * @author Ni Shengwu
 */
public class Indexer implements AutoCloseable {

    /** Index location; kept so it can be closed together with the writer. */
    private final Directory directory;

    /** Writer used to add documents to the index. */
    private final IndexWriter writer;

    /**
     * Opens (or creates) the index stored at {@code indexDir} and prepares a writer for it.
     *
     * @param indexDir file-system path of the directory that holds the index
     * @throws Exception if the directory or the index writer cannot be opened
     */
    public Indexer(String indexDir) throws Exception {
        Directory dir = FSDirectory.open(Paths.get(indexDir));
        try {
            // Standard analyzer: splits text into words and drops common stop words (is, a, the, ...).
            Analyzer analyzer = new StandardAnalyzer();
            IndexWriterConfig config = new IndexWriterConfig(analyzer);
            writer = new IndexWriter(dir, config);
        } catch (Exception e) {
            // The writer never took ownership, so release the directory here before propagating.
            try {
                dir.close();
            } catch (Exception closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
        directory = dir;
    }

    /**
     * Closes the index writer and then the underlying directory.
     *
     * @throws Exception if closing either resource fails
     */
    @Override
    public void close() throws Exception {
        try {
            writer.close();
        } finally {
            // Closed separately because this class opened the directory itself.
            directory.close();
        }
    }

    /**
     * Indexes every regular file located directly inside {@code dataDir}.
     * Sub-directories and other non-file entries are skipped.
     *
     * @param dataDir path of the directory whose files should be indexed
     * @return the document count reported by the writer after indexing
     * @throws IllegalArgumentException if {@code dataDir} cannot be listed as a directory
     * @throws Exception if reading a file or writing to the index fails
     */
    public int indexAll(String dataDir) throws Exception {
        // listFiles() returns null when the path is missing, is not a directory, or cannot be read.
        File[] files = new File(dataDir).listFiles();
        if (files == null) {
            throw new IllegalArgumentException("Not a readable directory: " + dataDir);
        }
        for (File file : files) {
            // Opening a directory with FileReader fails, so only regular files are indexed.
            if (file.isFile()) {
                indexFile(file);
            }
        }
        // NOTE(review): presumably this also counts documents already present in the index
        // from earlier runs, not only the files added above - confirm.
        return writer.numDocs();
    }

    /**
     * Builds the document for one file and adds it to the index.
     *
     * @param file the file to index
     * @throws Exception if the file cannot be read or the document cannot be added
     */
    private void indexFile(File file) throws Exception {
        System.out.println("索引文件的路径：" + file.getCanonicalPath());
        Document doc = getDocument(file);
        writer.addDocument(doc);
    }

    /**
     * Creates the Lucene document for a file. A document is comparable to a database row,
     * with each field acting as a column.
     *
     * @param file the file to describe
     * @return a document with {@code contents}, {@code fileName} and {@code fullPath} fields
     * @throws Exception if the file cannot be opened or its canonical path cannot be resolved
     */
    private Document getDocument(File file) throws Exception {
        Document doc = new Document();
        // File text, read through a Reader.
        // NOTE(review): FileReader decodes with the platform default charset - confirm this
        // matches the encoding of the data files.
        doc.add(new TextField("contents", new FileReader(file)));
        // File name, stored in the index so it can be retrieved from hits.
        doc.add(new TextField("fileName", file.getName(), Field.Store.YES));
        // Full path, stored in the index so it can be retrieved from hits.
        doc.add(new TextField("fullPath", file.getCanonicalPath(), Field.Store.YES));
        return doc;
    }

    /**
     * Demo entry point: indexes the files under a fixed data directory and prints
     * the elapsed time and the resulting document count.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String indexDir = "D:\\lucene"; // where the index is written
        String dataDir = "D:\\lucene\\data"; // directory holding the files to index
        int indexedNum = 0;
        long startTime = System.currentTimeMillis();
        // try-with-resources closes the indexer only if it was actually constructed.
        try (Indexer indexer = new Indexer(indexDir)) {
            indexedNum = indexer.indexAll(dataDir);
        } catch (Exception e) {
            e.printStackTrace();
        }
        long endTime = System.currentTimeMillis();
        System.out.println("索引耗时" + (endTime - startTime) + "毫秒");
        System.out.println("共索引了" + indexedNum + "个文件");
    }

}

import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.index.DirectoryReader;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.queryparser.classic.QueryParser;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.search.TopDocs;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

/**
 * Runs a query against the index and prints the path of each matching file.
 */
public class Searcher {

    /**
     * Searches the {@code contents} field of the index at {@code indexDir} for {@code q}
     * and prints the elapsed search time, the total hit count, and the stored
     * {@code fullPath} of up to the first 10 hits.
     *
     * @param indexDir file-system path of the directory that holds the index
     * @param q query string, parsed with the classic {@link QueryParser} syntax
     * @throws Exception if the index cannot be opened, the query cannot be parsed,
     *                   or the search fails
     */
    public static void search(String indexDir, String q) throws Exception {
        // try-with-resources guarantees the reader and directory are released even when
        // parsing or searching throws.
        try (Directory dir = FSDirectory.open(Paths.get(indexDir));
             IndexReader reader = DirectoryReader.open(dir)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            // Standard analyzer: splits text into words and drops common stop words (is, a, the, ...).
            Analyzer analyzer = new StandardAnalyzer();
            QueryParser parser = new QueryParser("contents", analyzer);
            Query query = parser.parse(q);

            long startTime = System.currentTimeMillis(); // search start time
            TopDocs docs = searcher.search(query, 10); // fetch at most the top 10 hits
            long endTime = System.currentTimeMillis(); // search end time

            System.out.println("匹配" + q + "共耗时" + (endTime - startTime) + "毫秒");
            System.out.println("查询到" + docs.totalHits + "条记录");

            for (ScoreDoc scoreDoc : docs.scoreDocs) {
                // scoreDoc.doc is the document id used to load the stored fields of the hit.
                Document doc = searcher.doc(scoreDoc.doc);
                // "fullPath" is the stored field written by the indexer.
                System.out.println(doc.get("fullPath"));
            }
        }
    }

    /**
     * Demo entry point: searches a fixed index location for a fixed query string.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String indexDir = "D:\\lucene";
        String q = "generate-maven-artifacts"; // the string to search for
        try {
            search(indexDir, q);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}

pom.xml

Lucene全文检索引擎的更多相关文章

Apache Lucene(全文检索引擎)—创建索引
目录返回目录:http://www.cnblogs.com/hanyinglong/p/5464604.html 本项目Demo已上传GitHub,欢迎大家fork下载学习:https://gith ...
Lucene 全文检索引擎
Apache Lucene PS: 苦学一周全文检索,由原来的搜索小白,到初次涉猎,感觉每门技术都博大精深,其中精髓亦是不可一日而语.那小博猪就简单介绍一下这一周的学习历程, 仅供各位程序猿们参考,这 ...
Apache Lucene(全文检索引擎)—分词器
目录返回目录:http://www.cnblogs.com/hanyinglong/p/5464604.html 本项目Demo已上传GitHub,欢迎大家fork下载学习:https://gith ...
【Lucene】Apache Lucene全文检索引擎架构之构建索引2
上一篇博文中已经对全文检索有了一定的了解,这篇文章主要来总结一下全文检索的第一步:构建索引.其实上一篇博文中的示例程序已经对构建索引写了一段程序了,而且那个程序还是挺完善的.不过从知识点的完整性来考虑 ...
【Lucene】Apache Lucene全文检索引擎架构之入门实战1
Lucene是一套用于全文检索和搜寻的开源程式库,由Apache软件基金会支持和提供.Lucene提供了一个简单却强大的应用程式接口,能够做全文索引和搜寻.在Java开发环境里Lucene是一个成熟的 ...
Apache Lucene(全文检索引擎)—搜索
目录返回目录:http://www.cnblogs.com/hanyinglong/p/5464604.html 本项目Demo已上传GitHub,欢迎大家fork下载学习:https://gith ...
【Lucene】Apache Lucene全文检索引擎架构之中文分词和高亮显示4
前面总结的都是使用Lucene的标准分词器,这是针对英文的,但是中文的话就不顶用了,因为中文的语汇与英文是不同的,所以一般我们开发的时候,有中文的话肯定要使用中文分词了,这一篇博文主要介绍一下如何使用 ...
【Lucene】Apache Lucene全文检索引擎架构之搜索功能3
上一节主要总结了一下Lucene是如何构建索引的,这一节简单总结一下Lucene中的搜索功能.主要分为几个部分,对特定项的搜索:查询表达式QueryParser的使用:指定数字范围内搜索:指定字符串开 ...
全文检索引擎　Lucene.net
全文搜索引擎是目前广泛应用的主流搜索引擎.它的工作原理是计算机索引程序通过扫描文章中的每一个词,对每一个词建立一个索引,指明该词在文章中出现的次数和位置,当用户查询时,检索程序就根据事先建立的索引进行 ...

随机推荐

linux上搭建ftp
linux上搭建ftp 重要解决如何搭建ftp 解决用户指定访问其根目录解决访问ftp超时连接解决ftp主动连接.被动连接的问题 1.安装ftp ...
LeetCode 650 - 2 Keys Keyboard
LeetCode 第650题 Initially on a notepad only one character 'A' is present. You can perform two operati ...
Nunit测试工具使用实例
前言: 本文主要是介绍了Nunit的基本使用,其中参详了很多已有的文章,由于最近要使用其进行测试,所以对网上的文章做了下整理,同时加入了一些自己的实践. NUnit的属性 TestFixture 它标 ...
python检查IP地址正确性
一.自动动手,丰衣足食 #encoding=utf-8 import os,sys def check_ip(ipaddr): addr = ipaddr.strip().split('.') #切割 ...
HashMap实现原理
学习笔记之HashMap篇,简单学习了解HashMap的实现原理和扩容. 大家都知道HashMap处理数据很快,时间复杂度O(1),那么是怎么做到的呢?那就先了解一下常见数据结构. 一般来说,我们把存 ...
hadoop streaming编程小demo(python版)
大数据团队搞数据质量评测.自动化质检和监控平台是用django,MR也是通过python实现的.(后来发现有orc压缩问题,python不知道怎么解决,正在改成java版本) 这里展示一个python ...
Javascript 面向对象编程—封装
前言 Javascript是一种基于对象(object-based)的语言,你遇到的所有东西几乎都是对象.但是,它又不是一种真正的面向对象编程(OOP)语言,因为它的语法中没有class(类) ...
【转载】quickLayout.css-快速构建结构兼容的web页面
文章转载自张鑫旭-鑫空间-鑫生活 http://www.zhangxinxu.com/wordpress/ 原文链接:http://www.zhangxinxu.com/wordpress/?p=4 ...
win10 uwp BadgeLogo 颜色
本文讲的是在上传应用商店出现BadgeLogo颜色问题,和如何解决,因为我是渣渣,本文可能带有一定的主观性和局限性,说的东西可能不对或者不符合每个人的预期.如果觉得我有讲的不对的,就多多包涵,或者直接 ...
LINUX下分区命令Parted详解
通常划分分区工具我们用得比较多是fdisk命令,但是现在由于磁盘越来越廉价,而且磁盘空间越来越大.而fdisk工具它对分区是有大小限制的,它只能划分小于2T的磁盘.现在的磁盘空间已经远远大于2T,有两 ...

Lucene全文检索引擎

Lucene全文检索引擎的更多相关文章

随机推荐

热门专题