package com.lucene.test;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.joone.engine.FullSynapse;
import org.joone.engine.LinearLayer;
import org.joone.engine.Monitor;
import org.joone.engine.NeuralNetEvent;
import org.joone.engine.NeuralNetListener;
import org.joone.engine.SigmoidLayer;
import org.joone.engine.learning.TeachingSynapse;
import org.joone.io.MemoryInputSynapse;
import org.joone.io.MemoryOutputSynapse;
import org.joone.net.NeuralNet;
import org.junit.Test;
import org.wltea.analyzer.lucene.IKAnalyzer;

import com.lucene.domain.Article;

/**
 * Demo test class combining two unrelated experiments:
 * 1) building and querying a Lucene full-text index over the 20_newsgroups corpus, and
 * 2) training a small Joone feed-forward network on the XOR problem.
 */
public class TestLucene implements NeuralNetListener {

    // Joone network plus the synapses feeding it training patterns and desired outputs.
    private NeuralNet nnet = null;
    private MemoryInputSynapse inputSynapse, desireOutputSynapse;
    LinearLayer input;
    SigmoidLayer hidden, output;
    boolean singleThreadMode = true;

    // XOR input patterns.
    private double[][] inputArray = new double[][]{
        {0.0, 0.0},
        {0.0, 1.0},
        {1.0, 0.0},
        {1.0, 1.0}
    };

    // XOR desired outputs.
    // BUG FIX: the last pattern was {1.0}, which is the truth table for OR;
    // XOR of (1,1) must be 0.0, matching the "XOR" comment and the standard
    // Joone XOR sample.
    private double[][] desiredOutputArray = new double[][]{
        {0.0},
        {1.0},
        {1.0},
        {0.0}
    };

    /**
     * Builds a Lucene index from files under ./20_newsgroups.
     * @throws Exception on any I/O or indexing failure
     */
@Test
public void testCreateIndex() throws Exception{
int fileNum = 1;
List<String> contents = new ArrayList<String>();
InputStream inputStream = null;
String value = null;
File directory = new File("./20_newsgroups");
if(directory.isDirectory()){
File[] files = directory.listFiles();
for (int i = 0; i < 1; i++) {
if(files[i].isDirectory()){
File[] subFiles = files[i].listFiles();
for (int j = 0; j < 10; j++) {
inputStream = new BufferedInputStream(new FileInputStream(subFiles[j]));
StringBuffer tempContent = new StringBuffer();
byte[] bytes = new byte[1024*10];
int len = 0;
while((len = inputStream.read(bytes))!=-1){
tempContent = tempContent.append(new String(bytes));
}
value = tempContent.toString();
System.out.println(value);
inputStream.close();
Article article = new Article(fileNum,subFiles[j].getName(),tempContent.toString());
Directory saveDirectory = FSDirectory.open(Paths.get("./indexDir/"));
//分词器
Analyzer analyzer = new WhitespaceAnalyzer();
IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
Document doc = new Document();
doc.add(new TextField("id", article.getId().toString(), Store.YES));
doc.add(new TextField("title", article.getTitle(), Store.YES));
doc.add(new TextField("content", article.getContent(), Store.YES));
IndexWriter indexWriter = new IndexWriter(saveDirectory,iwc);
System.out.println("have already add file to fileDocment system"+fileNum);
indexWriter.addDocument(doc);
indexWriter.close();//释放资源
fileNum = fileNum+1;
}
}
}
} //1.将需要添加的实体构造成实体对象
Article article = new Article(1,"Lucene是全文检索框架",
"全文检索(Full-Test Retrieval)是以文本作为检索对象,找出含有指定词汇的文本。"+
"全面,准确和快速是衡量全文检索系统的关键指标。"); //2,保存到数据库(此步骤暂时省略) //3、建立索引(lucene)
//索引库目录 //将 Article 转换为Document //保存到索引库中 } /**
* 测试搜索
* @throws IOException
* @throws ParseException
*/
@Test
public void testSearch() throws IOException, ParseException{
//1、搜索条件
String queryCondition = "philosophical"; //2、执行搜索(lucene)
List<Article> articles = new ArrayList<Article>(); //----------搜索代码------------------------
Directory directory = FSDirectory.open(Paths.get("./indexDir/"));
Analyzer analyzer = new WhitespaceAnalyzer();//创建分词器 //把查询字符串转换为Query对象(只在title中查询)
QueryParser queryParser = new QueryParser("content",analyzer);
Query query = queryParser.parse(queryCondition); //2执行搜索得到搜索结果
IndexReader indexReader = DirectoryReader.open(directory);
IndexSearcher indexSearcher = new IndexSearcher(indexReader);
TopDocs topDocs = indexSearcher.search(query, 100); Integer count = topDocs.totalHits;//总结果数量
ScoreDoc[] scoreDocs = topDocs.scoreDocs;//返回前N条结果 //2.3处理结果
for (int i = 0; i < scoreDocs.length; i++) {
ScoreDoc scoreDoc= scoreDocs[i];
int docId = scoreDoc.doc;
System.out.println("得分是:"+scoreDoc.score+"内部编号是:"+docId); //根据内部编号取出真正的Document数据
Document doc = indexSearcher.doc(docId); //将document转化为Article
Article article = new Article(Integer.parseInt(doc.get("id")),doc.get("title"),doc.get("content"));
articles.add(article);
} //------------------------------------------
//3、控制台显示结果
System.err.print("总结果数:"+count);
for (Article article : articles) {
System.out.println("查询结果:ID为:"+article.getId()+",title为:"+article.getTitle());
}
indexSearcher.getIndexReader().close();
} @Test
public void testNeuralNet(){
TestLucene testLucene = new TestLucene();
testLucene.initNeuralNet();
testLucene.train();
testLucene.interrogate();
} public void initNeuralNet(){
//First create the three layers
input = new LinearLayer();
hidden = new SigmoidLayer();
output = new SigmoidLayer(); //set the dimensions of the layers
input.setRows(2);
hidden.setRows(3);
output.setRows(1); input.setLayerName("L.input");
hidden.setLayerName("L.hidden");
output.setLayerName("L.output"); //Now create the two Synapses
FullSynapse synapse_IH = new FullSynapse();//input -->hidden conn
FullSynapse synapse_HO = new FullSynapse();//hidden -->output conn //Connect the input layer whit the hidden layer
input.addOutputSynapse(synapse_IH);
hidden.addInputSynapse(synapse_IH); //Connect the hidden layer whit the output layer
hidden.addOutputSynapse(synapse_HO);
output.addInputSynapse(synapse_HO); //the input to the neural net
inputSynapse = new MemoryInputSynapse();
input.addInputSynapse(inputSynapse); //The Trainer and its desired output
desireOutputSynapse = new MemoryInputSynapse();
TeachingSynapse trainer = new TeachingSynapse(); trainer.setDesired(desireOutputSynapse); //Now we add this structure to a NeuralNet object
nnet = new NeuralNet(); nnet.addLayer(input,NeuralNet.INPUT_LAYER);
nnet.addLayer(hidden,NeuralNet.HIDDEN_LAYER);
nnet.addLayer(output, NeuralNet.OUTPUT_LAYER);
nnet.setTeacher(trainer);
output.addOutputSynapse(trainer);
nnet.addNeuralNetListener(this);
} public void train(){
//set the inputs
inputSynapse.setInputArray(inputArray);
inputSynapse.setAdvancedColumnSelector("1,2");
//set the desired outputs
desireOutputSynapse.setInputArray(desiredOutputArray);
desireOutputSynapse.setAdvancedColumnSelector("1");
//get the monitor object to train or feed forward
Monitor monitor = nnet.getMonitor(); //set the monitor parameters
monitor.setLearningRate(0.8);
monitor.setMomentum(0.3);
monitor.setTrainingPatterns(inputArray.length);
monitor.setTotCicles(5000);
monitor.setLearning(true); long initms = System.currentTimeMillis();
//Run the network in single-thread,synchronized mode
nnet.getMonitor().setSingleThreadMode(singleThreadMode);
nnet.go(true);
System.out.println("Total time="+(System.currentTimeMillis()-initms)+"ms");
} public void interrogate(){
double[][] inputArray = new double[][]{
{0.0,1.0},
{1.0,0.0},
{1.0,1.0},
{0.0,0.0}
};
//set the inputs
inputSynapse.setInputArray(inputArray);
inputSynapse.setAdvancedColumnSelector("1,2");
Monitor monitor = nnet.getMonitor();
monitor.setTrainingPatterns(4);
monitor.setTotCicles(1);
monitor.setLearning(false);
MemoryOutputSynapse memOut = new MemoryOutputSynapse();
//set the output synapse to write the output of the net if(nnet != null){
nnet.addOutputSynapse(memOut);
System.out.println(nnet.check());
nnet.getMonitor().setSingleThreadMode(singleThreadMode);
nnet.go();
for (int i = 0; i < 4; i++) {
double[] pattern = memOut.getNextPattern();
System.out.println("Output pattern #"+(i+1)+"="+pattern[0]);
}
System.out.println("Interrogating Finished");
}
} public void cicleTerminated(NeuralNetEvent arg0) { } public void errorChanged(NeuralNetEvent e) {
Monitor mon=(Monitor) e.getSource();
if(mon.getCurrentCicle()%100==0){
System.out.println("Epoch:"+(mon.getTotCicles()-mon.getCurrentCicle())+"RMSE:"
+mon.getGlobalError());
}
} public void netStarted(NeuralNetEvent e) {
Monitor mon = (Monitor) e.getSource();
System.out.println("Network started for ");
if(mon.isLearning()){
System.out.println("training");
}else{
System.out.println("interrogation.");
}
} public void netStopped(NeuralNetEvent e) {
Monitor mon = (Monitor) e.getSource();
System.out.println("Network stopped . Last RMSE="
+mon.getGlobalError());
} public void netStoppedError(NeuralNetEvent e, String error) {
System.out.println("Network stopped due the following error:"
+error);
}
}

结果

得分是:0.25462872内部编号是:7840
得分是:0.24006625内部编号是:7841
总结果数:2
查询结果:ID为:2,title为:51060
查询结果:ID为:1,title为:49960

(注:原始输出中 "总结果数:2" 经 System.err 输出,与 System.out 的查询结果行交错在了一起,此处已按逻辑顺序还原。)

42、lucene和机器学习进行全文搜索,并排序的更多相关文章

  1. 基于JieBaNet+Lucene.Net实现全文搜索

    实现效果: 上一篇文章有附全文搜索结果的设计图,下面截一张开发完成上线后的实图: 基本风格是模仿的百度搜索结果,绿色的分页略显小清新. 目前已采集并创建索引的文章约3W多篇,索引文件不算太大,查询速度 ...

  2. Apache Solr采用Java开发、基于Lucene的全文搜索服务器

    http://docs.spring.io/spring-data/solr/ 首先介绍一下solr: Apache Solr (读音: SOLer) 是一个开源.高性能.采用Java开发.基于Luc ...

  3. OSCHina技术导向:Java全文搜索框架Lucene

    Lucene 是apache软件基金会一个开放源代码的全文检索引擎工具包,是一个全文检索引擎的架构,提供了完整的查询引擎和索引引擎,部分文本分析引擎.Lucene的目的是为软件开发人员提供一个简单易用 ...

  4. 记一次企业级爬虫系统升级改造(五):基于JieBaNet+Lucene.Net实现全文搜索

    实现效果: 上一篇文章有附全文搜索结果的设计图,下面截一张开发完成上线后的实图: 基本风格是模仿的百度搜索结果,绿色的分页略显小清新. 目前已采集并创建索引的文章约3W多篇,索引文件不算太大,查询速度 ...

  5. lucene全文搜索之四:创建索引搜索器、6种文档搜索器实现以及搜索结果分析(结合IKAnalyzer分词器的搜索器)基于lucene5.5.3

    前言: 前面几章已经很详细的讲解了如何创建索引器对索引进行增删查(没有更新操作).如何管理索引目录以及如何使用分词器,上一章讲解了如何生成索引字段和创建索引文档,并把创建的索引文档保存到索引目录,到这 ...

  6. lucene全文搜索之三:生成索引字段,创建索引文档(给索引字段加权)基于lucene5.5.3

    前言:上一章中我们已经实现了索引器的创建,但是我们没有索引文档,本章将会讲解如何生成字段.创建索引文档,给字段加权以及保存文档到索引器目录 luncene5.5.3集合jar包下载地址:http:// ...

  7. lucene全文搜索之二:创建索引器(创建IKAnalyzer分词器和索引目录管理)基于lucene5.5.3

    前言: lucene全文搜索之一中讲解了lucene开发搜索服务的基本结构,本章将会讲解如何创建索引器.管理索引目录和中文分词器的使用. 包括标准分词器,IKAnalyzer分词器以及两种索引目录的创 ...

  8. lucene全文搜索之一:lucene的主要功能和基本结构(基于lucene5.5.3)

    前言:lucene并不是像solr或elastic那样提供现成的.直接部署可用的系统,而是一套jar包,提供了一些常见语言分词.构建索引和创建搜索器等等功能的API,我们常用到的也就是分词器.索引目录 ...

  9. C# 全文搜索Lucene

    全文出自:https://blog.csdn.net/huangwenhua5000/article/details/9341751 1 lucene简介1.1 什么是luceneLucene是一个全 ...

随机推荐

  1. 基于CentOS与VmwareStation10搭建Oracle11G RAC 64集群环境:5.安装Oracle RAC FAQ-RAC安装DB软件runInstaller看不到节点

    集群安装正常: [root@kmdbrac1 ~]# crs_stat -t -v Name Type R/RA F/FT Target State Host -------------------- ...

  2. asp.net中使用基于角色role的Forms验证

    http://www.cnblogs.com/yao/archive/2006/06/24/434783.html asp.net中使用基于角色role的Forms验证,大致经过几下四步:1.配置系统 ...

  3. [Spring MVC] - JSON

    Spring MVC中使用JSON,先必需引用两个包:jackson-core-asl-1.9.13.jar.jackson-mapper-asl-1.9.13.jar 因为需要使用到jquery测试 ...

  4. sql server远程备份和恢复

    sql server远程备份和恢复 SQLSERVER服务实例名称:192.168.0.2需要备份的数据库名称: a备份机器名称(Client端):192.168.0.3备份机用户:zf 密码:123 ...

  5. Hadoop总结篇之五---模块间是怎么驱动执行的

    在MRv1中,各个模块间驱动运行的方式是函数调用的方式.这是同步的过程,上一模块调用下一模块函数后,等待其执行.效率不高. 在MRv2中做了改进,yarn基于事件驱动的并发模型.在详细介绍前,先看下图 ...

  6. 关于import caffe出错的解决

    [http://blog.csdn.net/wuzuyu365/article/details/52431062]关于在caffe下,import caffe报错的解决:conda install p ...

  7. sqlite升级--浅谈Android数据库版本升级及数据的迁移

    Android开发涉及到的数据库采用的是轻量级的SQLite3,而在实际开发中,在存储一些简单的数据,使用SharedPreferences就足够了,只有在存储数据结构稍微复杂的时候,才会使用数据库来 ...

  8. 样式hack

    1.CSS 重置 html, body, div, span, applet, object, iframe, h1, h2, h3, h4, h5, h6, p, blockquote, pre, ...

  9. NPOI Excel 单元格背景颜色对照表

    NPOI Excel 单元格颜色对照表,在引用了 NPOI.dll 后可通过 ICellStyle 接口的 FillForegroundColor 属性实现 Excel 单元格的背景色设置,FillP ...

  10. [delphi]indy idhttp post方法

    网易 博客 LOFTCam-用心创造滤镜 LOFTER-最美图片社交APP 送20张免费照片冲印 > 注册登录  加关注 techiepc的博客 万事如意 首页 日志 LOFTER 相册 音乐 ...