package com.lucene.test;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.joone.engine.FullSynapse;
import org.joone.engine.LinearLayer;
import org.joone.engine.Monitor;
import org.joone.engine.NeuralNetEvent;
import org.joone.engine.NeuralNetListener;
import org.joone.engine.SigmoidLayer;
import org.joone.engine.learning.TeachingSynapse;
import org.joone.io.MemoryInputSynapse;
import org.joone.io.MemoryOutputSynapse;
import org.joone.net.NeuralNet;
import org.junit.Test;
import org.wltea.analyzer.lucene.IKAnalyzer;

import com.lucene.domain.Article;

/**
 * Demo test class combining two unrelated experiments:
 * 1) building and querying a Lucene full-text index over the 20_newsgroups corpus, and
 * 2) training a small Joone feed-forward network on the XOR problem.
 */
public class TestLucene implements NeuralNetListener {

    // Joone network plus the synapses feeding it training patterns and desired outputs.
    private NeuralNet nnet = null;
    private MemoryInputSynapse inputSynapse, desireOutputSynapse;
    LinearLayer input;
    SigmoidLayer hidden, output;
    boolean singleThreadMode = true;

    // XOR input patterns.
    private double[][] inputArray = new double[][]{
        {0.0, 0.0},
        {0.0, 1.0},
        {1.0, 0.0},
        {1.0, 1.0}
    };

    // XOR desired outputs.
    // BUG FIX: the last pattern was {1.0}, which is the truth table for OR;
    // XOR of (1,1) must be 0.0, matching the "XOR" comment and the standard
    // Joone XOR sample.
    private double[][] desiredOutputArray = new double[][]{
        {0.0},
        {1.0},
        {1.0},
        {0.0}
    };

    /**
     * Builds a Lucene index from files under ./20_newsgroups.
     * @throws Exception on any I/O or indexing failure
     */
@Test
public void testCreateIndex() throws Exception{
int fileNum = 1;
List<String> contents = new ArrayList<String>();
InputStream inputStream = null;
String value = null;
File directory = new File("./20_newsgroups");
if(directory.isDirectory()){
File[] files = directory.listFiles();
for (int i = 0; i < 1; i++) {
if(files[i].isDirectory()){
File[] subFiles = files[i].listFiles();
for (int j = 0; j < 10; j++) {
inputStream = new BufferedInputStream(new FileInputStream(subFiles[j]));
StringBuffer tempContent = new StringBuffer();
byte[] bytes = new byte[1024*10];
int len = 0;
while((len = inputStream.read(bytes))!=-1){
tempContent = tempContent.append(new String(bytes));
}
value = tempContent.toString();
System.out.println(value);
inputStream.close();
Article article = new Article(fileNum,subFiles[j].getName(),tempContent.toString());
Directory saveDirectory = FSDirectory.open(Paths.get("./indexDir/"));
//分词器
Analyzer analyzer = new WhitespaceAnalyzer();
IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
Document doc = new Document();
doc.add(new TextField("id", article.getId().toString(), Store.YES));
doc.add(new TextField("title", article.getTitle(), Store.YES));
doc.add(new TextField("content", article.getContent(), Store.YES));
IndexWriter indexWriter = new IndexWriter(saveDirectory,iwc);
System.out.println("have already add file to fileDocment system"+fileNum);
indexWriter.addDocument(doc);
indexWriter.close();//释放资源
fileNum = fileNum+1;
}
}
}
} //1.将需要添加的实体构造成实体对象
Article article = new Article(1,"Lucene是全文检索框架",
"全文检索(Full-Test Retrieval)是以文本作为检索对象,找出含有指定词汇的文本。"+
"全面,准确和快速是衡量全文检索系统的关键指标。"); //2,保存到数据库(此步骤暂时省略) //3、建立索引(lucene)
//索引库目录 //将 Article 转换为Document //保存到索引库中 } /**
* 测试搜索
* @throws IOException
* @throws ParseException
*/
@Test
public void testSearch() throws IOException, ParseException{
//1、搜索条件
String queryCondition = "philosophical"; //2、执行搜索(lucene)
List<Article> articles = new ArrayList<Article>(); //----------搜索代码------------------------
Directory directory = FSDirectory.open(Paths.get("./indexDir/"));
Analyzer analyzer = new WhitespaceAnalyzer();//创建分词器 //把查询字符串转换为Query对象(只在title中查询)
QueryParser queryParser = new QueryParser("content",analyzer);
Query query = queryParser.parse(queryCondition); //2执行搜索得到搜索结果
IndexReader indexReader = DirectoryReader.open(directory);
IndexSearcher indexSearcher = new IndexSearcher(indexReader);
TopDocs topDocs = indexSearcher.search(query, 100); Integer count = topDocs.totalHits;//总结果数量
ScoreDoc[] scoreDocs = topDocs.scoreDocs;//返回前N条结果 //2.3处理结果
for (int i = 0; i < scoreDocs.length; i++) {
ScoreDoc scoreDoc= scoreDocs[i];
int docId = scoreDoc.doc;
System.out.println("得分是:"+scoreDoc.score+"内部编号是:"+docId); //根据内部编号取出真正的Document数据
Document doc = indexSearcher.doc(docId); //将document转化为Article
Article article = new Article(Integer.parseInt(doc.get("id")),doc.get("title"),doc.get("content"));
articles.add(article);
} //------------------------------------------
//3、控制台显示结果
System.err.print("总结果数:"+count);
for (Article article : articles) {
System.out.println("查询结果:ID为:"+article.getId()+",title为:"+article.getTitle());
}
indexSearcher.getIndexReader().close();
} @Test
public void testNeuralNet(){
TestLucene testLucene = new TestLucene();
testLucene.initNeuralNet();
testLucene.train();
testLucene.interrogate();
} public void initNeuralNet(){
//First create the three layers
input = new LinearLayer();
hidden = new SigmoidLayer();
output = new SigmoidLayer(); //set the dimensions of the layers
input.setRows(2);
hidden.setRows(3);
output.setRows(1); input.setLayerName("L.input");
hidden.setLayerName("L.hidden");
output.setLayerName("L.output"); //Now create the two Synapses
FullSynapse synapse_IH = new FullSynapse();//input -->hidden conn
FullSynapse synapse_HO = new FullSynapse();//hidden -->output conn //Connect the input layer whit the hidden layer
input.addOutputSynapse(synapse_IH);
hidden.addInputSynapse(synapse_IH); //Connect the hidden layer whit the output layer
hidden.addOutputSynapse(synapse_HO);
output.addInputSynapse(synapse_HO); //the input to the neural net
inputSynapse = new MemoryInputSynapse();
input.addInputSynapse(inputSynapse); //The Trainer and its desired output
desireOutputSynapse = new MemoryInputSynapse();
TeachingSynapse trainer = new TeachingSynapse(); trainer.setDesired(desireOutputSynapse); //Now we add this structure to a NeuralNet object
nnet = new NeuralNet(); nnet.addLayer(input,NeuralNet.INPUT_LAYER);
nnet.addLayer(hidden,NeuralNet.HIDDEN_LAYER);
nnet.addLayer(output, NeuralNet.OUTPUT_LAYER);
nnet.setTeacher(trainer);
output.addOutputSynapse(trainer);
nnet.addNeuralNetListener(this);
} public void train(){
//set the inputs
inputSynapse.setInputArray(inputArray);
inputSynapse.setAdvancedColumnSelector("1,2");
//set the desired outputs
desireOutputSynapse.setInputArray(desiredOutputArray);
desireOutputSynapse.setAdvancedColumnSelector("1");
//get the monitor object to train or feed forward
Monitor monitor = nnet.getMonitor(); //set the monitor parameters
monitor.setLearningRate(0.8);
monitor.setMomentum(0.3);
monitor.setTrainingPatterns(inputArray.length);
monitor.setTotCicles(5000);
monitor.setLearning(true); long initms = System.currentTimeMillis();
//Run the network in single-thread,synchronized mode
nnet.getMonitor().setSingleThreadMode(singleThreadMode);
nnet.go(true);
System.out.println("Total time="+(System.currentTimeMillis()-initms)+"ms");
} public void interrogate(){
double[][] inputArray = new double[][]{
{0.0,1.0},
{1.0,0.0},
{1.0,1.0},
{0.0,0.0}
};
//set the inputs
inputSynapse.setInputArray(inputArray);
inputSynapse.setAdvancedColumnSelector("1,2");
Monitor monitor = nnet.getMonitor();
monitor.setTrainingPatterns(4);
monitor.setTotCicles(1);
monitor.setLearning(false);
MemoryOutputSynapse memOut = new MemoryOutputSynapse();
//set the output synapse to write the output of the net if(nnet != null){
nnet.addOutputSynapse(memOut);
System.out.println(nnet.check());
nnet.getMonitor().setSingleThreadMode(singleThreadMode);
nnet.go();
for (int i = 0; i < 4; i++) {
double[] pattern = memOut.getNextPattern();
System.out.println("Output pattern #"+(i+1)+"="+pattern[0]);
}
System.out.println("Interrogating Finished");
}
} public void cicleTerminated(NeuralNetEvent arg0) { } public void errorChanged(NeuralNetEvent e) {
Monitor mon=(Monitor) e.getSource();
if(mon.getCurrentCicle()%100==0){
System.out.println("Epoch:"+(mon.getTotCicles()-mon.getCurrentCicle())+"RMSE:"
+mon.getGlobalError());
}
} public void netStarted(NeuralNetEvent e) {
Monitor mon = (Monitor) e.getSource();
System.out.println("Network started for ");
if(mon.isLearning()){
System.out.println("training");
}else{
System.out.println("interrogation.");
}
} public void netStopped(NeuralNetEvent e) {
Monitor mon = (Monitor) e.getSource();
System.out.println("Network stopped . Last RMSE="
+mon.getGlobalError());
} public void netStoppedError(NeuralNetEvent e, String error) {
System.out.println("Network stopped due the following error:"
+error);
}
}

结果

得分是:0.25462872内部编号是:7840
得分是:0.24006625内部编号是:7841
总结果数:2
查询结果:ID为:2,title为:51060
查询结果:ID为:1,title为:49960

(注:原始输出中 "总结果数:2" 经 System.err 输出,与 System.out 的查询结果行交错在了一起,此处已按逻辑顺序还原。)

42、lucene和机器学习进行全文搜索,并排序的更多相关文章

  1. 基于JieBaNet+Lucene.Net实现全文搜索

    实现效果: 上一篇文章有附全文搜索结果的设计图,下面截一张开发完成上线后的实图: 基本风格是模仿的百度搜索结果,绿色的分页略显小清新. 目前已采集并创建索引的文章约3W多篇,索引文件不算太大,查询速度 ...

  2. Apache Solr采用Java开发、基于Lucene的全文搜索服务器

    http://docs.spring.io/spring-data/solr/ 首先介绍一下solr: Apache Solr (读音: SOLer) 是一个开源.高性能.采用Java开发.基于Luc ...

  3. OSCHina技术导向:Java全文搜索框架Lucene

    Lucene 是apache软件基金会一个开放源代码的全文检索引擎工具包,是一个全文检索引擎的架构,提供了完整的查询引擎和索引引擎,部分文本分析引擎.Lucene的目的是为软件开发人员提供一个简单易用 ...

  4. 记一次企业级爬虫系统升级改造(五):基于JieBaNet+Lucene.Net实现全文搜索

    实现效果: 上一篇文章有附全文搜索结果的设计图,下面截一张开发完成上线后的实图: 基本风格是模仿的百度搜索结果,绿色的分页略显小清新. 目前已采集并创建索引的文章约3W多篇,索引文件不算太大,查询速度 ...

  5. lucene全文搜索之四:创建索引搜索器、6种文档搜索器实现以及搜索结果分析(结合IKAnalyzer分词器的搜索器)基于lucene5.5.3

    前言: 前面几章已经很详细的讲解了如何创建索引器对索引进行增删查(没有更新操作).如何管理索引目录以及如何使用分词器,上一章讲解了如何生成索引字段和创建索引文档,并把创建的索引文档保存到索引目录,到这 ...

  6. lucene全文搜索之三:生成索引字段,创建索引文档(给索引字段加权)基于lucene5.5.3

    前言:上一章中我们已经实现了索引器的创建,但是我们没有索引文档,本章将会讲解如何生成字段.创建索引文档,给字段加权以及保存文档到索引器目录 luncene5.5.3集合jar包下载地址:http:// ...

  7. lucene全文搜索之二:创建索引器(创建IKAnalyzer分词器和索引目录管理)基于lucene5.5.3

    前言: lucene全文搜索之一中讲解了lucene开发搜索服务的基本结构,本章将会讲解如何创建索引器.管理索引目录和中文分词器的使用. 包括标准分词器,IKAnalyzer分词器以及两种索引目录的创 ...

  8. lucene全文搜索之一:lucene的主要功能和基本结构(基于lucene5.5.3)

    前言:lucene并不是像solr或elastic那样提供现成的.直接部署可用的系统,而是一套jar包,提供了一些常见语言分词.构建索引和创建搜索器等等功能的API,我们常用到的也就是分词器.索引目录 ...

  9. C# 全文搜索Lucene

    全文出自:https://blog.csdn.net/huangwenhua5000/article/details/9341751 1 lucene简介1.1 什么是luceneLucene是一个全 ...

随机推荐

  1. 基于CentOS与VmwareStation10搭建Oracle11G RAC 64集群环境:5.安装Oracle RAC FAQ-RAC安装DB软件runInstaller看不到节点

    集群安装正常: [root@kmdbrac1 ~]# crs_stat -t -v Name Type R/RA F/FT Target State Host -------------------- ...

  2. asp.net中使用基于角色role的Forms验证

    http://www.cnblogs.com/yao/archive/2006/06/24/434783.html asp.net中使用基于角色role的Forms验证,大致经过几下四步:1.配置系统 ...

  3. [Spring MVC] - JSON

    Spring MVC中使用JSON,先必需引用两个包:jackson-core-asl-1.9.13.jar.jackson-mapper-asl-1.9.13.jar 因为需要使用到jquery测试 ...

  4. sql server远程备份和恢复

    sql server远程备份和恢复 SQLSERVER服务实例名称:192.168.0.2需要备份的数据库名称: a备份机器名称(Client端):192.168.0.3备份机用户:zf 密码:123 ...

  5. Hadoop总结篇之五---模块间是怎么驱动执行的

    在MRv1中,各个模块间驱动运行的方式是函数调用的方式.这是同步的过程,上一模块调用下一模块函数后,等待其执行.效率不高. 在MRv2中做了改进,yarn基于事件驱动的并发模型.在详细介绍前,先看下图 ...

  6. 关于import caffe出错的解决

    [http://blog.csdn.net/wuzuyu365/article/details/52431062]关于在caffe下,import caffe报错的解决:conda install p ...

  7. sqlite升级--浅谈Android数据库版本升级及数据的迁移

    Android开发涉及到的数据库采用的是轻量级的SQLite3,而在实际开发中,在存储一些简单的数据,使用SharedPreferences就足够了,只有在存储数据结构稍微复杂的时候,才会使用数据库来 ...

  8. 样式hack

    1.CSS 重置 html, body, div, span, applet, object, iframe, h1, h2, h3, h4, h5, h6, p, blockquote, pre, ...

  9. NPOI Excel 单元格背景颜色对照表

    NPOI Excel 单元格颜色对照表,在引用了 NPOI.dll 后可通过 ICellStyle 接口的 FillForegroundColor 属性实现 Excel 单元格的背景色设置,FillP ...

  10. [delphi]indy idhttp post方法

    网易 博客 LOFTCam-用心创造滤镜 LOFTER-最美图片社交APP 送20张免费照片冲印 > 注册登录  加关注 techiepc的博客 万事如意 首页 日志 LOFTER 相册 音乐 ...