solr实现动态加载分词

版本是5.3.0

在core（自己创建的模块）的schema.xml里面增加类型：

<!-- Field type whose index-time and query-time analyzers both tokenize with IK. -->
<fieldType name="text_lj" class="solr.TextField" positionIncrementGap="100">
  <analyzer type="index">
    <!-- conf names ik.conf, a file created in the same directory as this schema.xml. -->
    <tokenizer class="org.wltea.analyzer.lucene.IKTokenizerFactory" useSmart="false" conf="ik.conf"/>
  </analyzer>
  <analyzer type="query">
    <!-- IKTokenizerFactory is the class that is modified later in this article. -->
    <tokenizer class="org.wltea.analyzer.lucene.IKTokenizerFactory" useSmart="false" conf="ik.conf"/>
  </analyzer>
</fieldType>

<!-- Example field that uses the type defined above. -->
<field name="desc" type="text_lj" indexed="true" stored="true" required="true" multiValued="false"/>

ik.conf：

# Version counter: IKTokenizerFactory reloads the dictionaries only when this
# number is greater than the value it read on its previous check.
lastupdate=1

# Dictionary files to load; several names may be given, separated by commas
# and/or whitespace.
files=extDic.txt

lastupdate表示版本号：每次添加了新的词条，就把版本号加1，分词器只有在发现版本号变大时才会重新加载词典。files表示词典文件，可以写多个文件名，用英文逗号分隔。在同级目录下创建文件extDic.txt。

extDic.txt的内容（文件必须以UTF-8编码保存，每行一个词）：

小红帽

华为手机

格力空调

目录结构如下（ik.conf、extDic.txt与schema.xml放在同一目录）：

配置已经完成，现在最主要的是修改ik分词器的源码，主要的思路是创建一个线程轮询更新分词

源码下载地址：https://codeload.github.com/EugenePig/ik-analyzer-solr5/zip/master

使用IntelliJ IDEA打开工程：

主要涉及这三个类：UpdateKeeper、IKTokenizerFactory和Dictionary。其中UpdateKeeper是新创建的，用于定时轮询并触发配置文件的重新读取

package org.wltea.analyzer.lucene;

import java.io.IOException;

import java.util.Vector;

//TODO optimize

/**
 * Process-wide poller that wakes up every {@link #INTERVAL} milliseconds and
 * calls {@link UpdateJob#update()} on every registered job.
 *
 * <p>The single instance is created lazily by {@link #getInstance()} and owns
 * one daemon thread that is started from the constructor. Registered jobs are
 * never removed. Interrupting the polling thread stops it.
 */
public class UpdateKeeper implements Runnable{

    /** Callback invoked once per polling cycle for each registered object. */
    public static interface UpdateJob{

        /**
         * Performs one refresh attempt.
         *
         * @throws IOException if the refresh fails; the poller reports it and
         *         carries on with the remaining jobs
         */
        public void update() throws IOException ;

    }

    /** Pause between two polling cycles, in milliseconds (one minute). */
    final static int INTERVAL = 1 * 60 * 1000;

    // volatile is required for the double-checked locking in getInstance():
    // without it another thread may observe a partially constructed instance.
    private static volatile UpdateKeeper singleton;

    /** Registered jobs; Vector gives synchronized add/toArray. */
    Vector<UpdateJob> filterFactorys;

    /** Daemon thread that executes {@link #run()}. */
    Thread worker;

    private UpdateKeeper(){
        filterFactorys = new Vector<UpdateJob>();
        worker = new Thread(this);
        worker.setDaemon(true);
        // Fields are assigned before start(), so the worker sees them initialized.
        worker.start();
    }

    /**
     * Returns the shared instance, creating it (and starting its polling
     * thread) on first use.
     *
     * @return the single {@code UpdateKeeper}
     */
    public static UpdateKeeper getInstance(){
        UpdateKeeper instance = singleton;
        if (instance == null) {
            synchronized (UpdateKeeper.class) {
                instance = singleton;
                if (instance == null) {
                    instance = new UpdateKeeper();
                    singleton = instance;
                }
            }
        }
        return instance;
    }

    /**
     * Keeps a reference to the given job so that later polling cycles can
     * call its {@code update()} method.
     *
     * @param filterFactory the job to poll
     */
    public void register(UpdateKeeper.UpdateJob filterFactory ){
        filterFactorys.add(filterFactory);
    }

    /**
     * Polling loop: sleep for {@link #INTERVAL}, then update every registered
     * job. Returns when the executing thread is interrupted.
     */
    @Override
    public void run() {
        while (!Thread.currentThread().isInterrupted()) {
            try {
                Thread.sleep(INTERVAL);
            } catch (InterruptedException e) {
                // Restore the flag and stop instead of swallowing the request.
                Thread.currentThread().interrupt();
                return;
            }

            // Iterate over a snapshot: a for-each directly over the Vector is
            // not synchronized and fails with ConcurrentModificationException
            // if register() is called while a cycle is in progress.
            UpdateJob[] jobs = filterFactorys.toArray(new UpdateJob[0]);
            for (UpdateJob job : jobs) {
                if (job == null) {
                    continue;
                }
                try {
                    job.update();
                } catch (IOException e) {
                    e.printStackTrace();
                } catch (RuntimeException e) {
                    // One misbehaving job must not terminate the poller.
                    e.printStackTrace();
                }
            }
        }
    }

}

/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 *     http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */

package org.wltea.analyzer.lucene;

import java.io.IOException;

import java.io.InputStream;

import java.util.*;

import org.apache.lucene.analysis.Tokenizer;

import org.apache.lucene.analysis.util.ResourceLoader;

import org.apache.lucene.analysis.util.ResourceLoaderAware;

import org.apache.lucene.analysis.util.TokenizerFactory;

import org.apache.lucene.util.AttributeFactory;

import org.wltea.analyzer.dic.Dictionary;

/**
 * Tokenizer factory for the IK analyzer that can load extension dictionaries
 * while the application is running.
 *
 * <p>Two arguments are read from the factory configuration: {@code useSmart}
 * (boolean, defaults to {@code false}) and {@code conf}, the name of a
 * properties file with the keys {@code lastupdate} (a version number) and
 * {@code files} (dictionary file names). When {@code conf} is set, the factory
 * registers itself with {@link UpdateKeeper}, which calls {@link #update()}
 * periodically; the dictionaries are loaded whenever {@code lastupdate} is
 * greater than the value seen on the previous check.
 *
 * @author <a href="mailto:su.eugene@gmail.com">Eugene Su</a>
 */
public class IKTokenizerFactory extends TokenizerFactory implements
        ResourceLoaderAware, UpdateKeeper.UpdateJob{

  private boolean useSmart;

  /** Loader received in {@link #inform(ResourceLoader)}; null before that call. */
  private ResourceLoader loader;

  /** Value of {@code lastupdate} read on the most recent check; -1 before the first one. */
  private long lastUpdateTime = -1;

  /** Name of the properties resource, or null when dynamic loading is not configured. */
  private String conf = null;

  /** @return the {@code useSmart} flag passed to every tokenizer created by this factory */
  public boolean useSmart() {
    return useSmart;
  }

  /** @param useSmart the flag passed to tokenizers created from now on */
  public void setUseSmart(boolean useSmart) {
    this.useSmart = useSmart;
  }

  /**
   * Creates the factory from its configuration arguments.
   *
   * @param args factory arguments; {@code useSmart} and {@code conf} are read
   */
  public IKTokenizerFactory(Map<String,String> args) {
    super(args);
    // Boolean.parseBoolean(null) is false, so a missing argument needs no special case.
    // The field is assigned directly to avoid calling an overridable method
    // from the constructor.
    this.useSmart = Boolean.parseBoolean(args.get("useSmart"));
    conf = get(args, "conf");
  }

  @Override
  public Tokenizer create(AttributeFactory factory) {
    return new IKTokenizer(factory, this.useSmart);
  }

  /**
   * Loads the dictionary files listed in the conf resource into the main
   * dictionary if the conf's {@code lastupdate} value has increased.
   *
   * <p>Note that {@link #canUpdate()} records the new version before the files
   * are opened, so a load that fails here is not retried until
   * {@code lastupdate} is increased again.
   *
   * @throws IOException if a dictionary resource cannot be opened
   */
  @Override
  public void update() throws IOException {
    Properties p = canUpdate();
    if (p == null) {
      return;
    }

    List<InputStream> inputStreamList = new ArrayList<InputStream>();
    try {
      for (String path : SplitFileNames(p.getProperty("files"))) {
        if (path == null || path.isEmpty()) {
          continue;
        }
        InputStream is = loader.openResource(path);
        if (is != null) {
          inputStreamList.add(is);
        }
      }
      if (!inputStreamList.isEmpty()) {
        Dictionary.addDic2MainDic(inputStreamList); // load dic to MainDic
      }
    } finally {
      // Release every stream opened above, including when a later
      // openResource() call threw. Closing an already closed stream has no
      // effect, so this is safe even if the dictionary closed them itself.
      for (InputStream is : inputStreamList) {
        closeQuietly(is);
      }
    }
  }

  /**
   * Stores the resource loader, performs an initial dictionary load and, when
   * a conf resource is configured, registers this factory for periodic updates.
   *
   * @param resourceLoader loader used to open the conf and dictionary resources
   * @throws IOException if the initial load fails to open a dictionary resource
   */
  @Override
  public void inform(ResourceLoader resourceLoader) throws IOException {
    System.out.println(":::ik:::inform::::::::::::::::::::::::" + conf);
    this.loader = resourceLoader;
    this.update();
    if (conf != null && !conf.trim().isEmpty()) {
      UpdateKeeper.getInstance().register(this);
    }
  }

  /**
   * Reads the conf resource and decides whether the dictionaries need loading.
   *
   * @return the parsed properties when {@code lastupdate} is greater than the
   *         previously seen value and {@code files} is non-blank; otherwise
   *         null (also when no conf is configured or it cannot be read/parsed)
   */
  private Properties canUpdate() {
    if (conf == null) {
      return null;
    }
    try {
      Properties p = new Properties();
      InputStream confStream = loader.openResource(conf);
      try {
        p.load(confStream);
      } finally {
        // Close even when load() throws.
        closeQuietly(confStream);
      }

      // Properties keeps trailing whitespace in values, so trim before parsing;
      // otherwise "lastupdate=1 " would fail to parse and never trigger a load.
      long version = Long.parseLong(p.getProperty("lastupdate", "0").trim());
      boolean newer = version > this.lastUpdateTime;
      // The seen version is recorded in both cases, so lowering the number in
      // the conf also lowers the threshold for the next check.
      this.lastUpdateTime = version;
      if (!newer) {
        return null;
      }

      String paths = p.getProperty("files");
      if (paths == null || paths.trim().isEmpty()) { // at least one file is required
        return null;
      }
      System.out.println("loading conf");
      return p;
    } catch (Exception e) {
      // Any failure (missing resource, bad number, I/O error) lands here; the
      // message reports the actual exception instead of assuming an NPE.
      System.err.println("IK failed to read conf '" + conf + "': " + e);
      return null;
    }
  }

  /**
   * Splits a list of file names on commas and/or whitespace.
   *
   * @param fileNames the raw list, may be null
   * @return the individual names in order; an empty list when {@code fileNames}
   *         is null. A leading separator yields a leading empty string.
   */
  public static List<String> SplitFileNames(String fileNames) {
    if (fileNames == null) {
      return Collections.<String> emptyList();
    }
    return new ArrayList<String>(Arrays.asList(fileNames.split("[,\\s]+")));
  }

  /** Closes the stream if it is non-null, ignoring any failure to close. */
  private static void closeQuietly(InputStream stream) {
    if (stream == null) {
      return;
    }
    try {
      stream.close();
    } catch (IOException ignored) {
      // Nothing useful can be done if close() itself fails.
    }
  }

}

Dictionary类里面新增方法：

Dictionary是单例模式

/**
 * Adds the words read from the given streams to the main in-memory dictionary.
 *
 * <p>Each stream is read as UTF-8 text, one word per line. Lines are trimmed,
 * blank lines are skipped and words are lower-cased before being added. Every
 * non-null stream is closed before this method returns. An I/O error on one
 * stream is reported to stderr and does not prevent the remaining streams
 * from being processed.
 *
 * @param inputStreams dictionary streams; null entries are skipped and a null
 *        list is treated as empty
 */
public static void addDic2MainDic(List<InputStream> inputStreams){
    if (inputStreams == null) {
        return;
    }
    if (singleton == null) {
        // presumably initial() populates singleton - confirm against Dictionary.initial
        Configuration cfg = DefaultConfig.getInstance();
        Dictionary.initial(cfg);
    }
    for (InputStream is : inputStreams) {
        // Ignore extension dictionaries that could not be found.
        if (is == null) {
            continue;
        }
        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(is , "UTF-8"), 512);
            boolean firstLine = true;
            for (String line = br.readLine(); line != null; line = br.readLine()) {
                if (firstLine) {
                    firstLine = false;
                    // Files saved as "UTF-8 with BOM" start with U+FEFF, which
                    // trim() does not remove and which would corrupt the first word.
                    if (line.length() > 0 && line.charAt(0) == '\uFEFF') {
                        line = line.substring(1);
                    }
                }
                String word = line.trim();
                if (word.length() > 0) {
                    // Locale.ROOT keeps lower-casing independent of the JVM's
                    // default locale (e.g. Turkish dotless i).
                    singleton._MainDict.fillSegment(
                            word.toLowerCase(java.util.Locale.ROOT).toCharArray());
                }
            }
        } catch (IOException ioe) {
            System.err.println("Extension Dictionary loading exception.");
            ioe.printStackTrace();
        } finally {
            try {
                is.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

最后将工程打成jar包，放到Solr的WEB-INF/lib目录里面。大功告成！

solr实现动态加载分词的更多相关文章

中文分词实战——基于jieba动态加载字典和调整词频的电子病历分词
分词是自然语言处理中最基本的一个任务,这篇小文章不介绍相关的理论,而是介绍一个电子病历分词的小实践. 开源的分词工具中,我用过的有jieba.hnlp和stanfordnlp,感觉jieba无论安装和 ...
js动态加载css和js
之前写了一个工具类点此链接里面含有这段代码,感觉用处挺多,特意提出来 var loadUtil = { /* * 方法说明:[动态加载js文件css文件] * 使用方法:loadUtil.loadjs ...
geotrellis使用（二十三）动态加载时间序列数据
目录前言实现方法总结一.前言今天要介绍的绝对是华丽的干货.比如我们从互联网上下载到了一系列(每天或者月平均等)的MODIS数据,我们怎么能够对比同一区域不同时间的数据情况,采用 ...
Ext JS 如何动态加载JavaScript创建窗体
JavaScript不需要编译即可运行,这让JavaScript构建的应用程序可以变得很灵活.我们可以根据需要动态从服务器加载JavaScript脚本来创建和控制UI来与用户交互.下面结合Ext JS ...
Ext动态加载Toolbar
在使用Ext的GridPanel时候,有时候需要面板不用重新加载而去更新Store或者Toolbar,Store的方法有很多,例如官方api给我们提供的Store.load(),Store.reLoa ...
Android动态加载框架汇总
几种动态加载的比较 1.Tinker 用途:热修复 GitHub地址:https://github.com/Tencent/tinker/ 使用:http://www.jianshu.com/p/f6 ...
为不同分辨率单独做样式文件，在页面头部用js判断分辨率后动态加载定义好的样式文件
为不同分辨率单独做样式文件,在页面头部用js判断分辨率后动态加载定义好的样式文件.样式文件命名格式如:forms[_屏幕宽度].css,样式文件中只需重新定义文本框和下拉框的宽度即可. 在包含的头文件 ...
html中的图像动态加载问题
首先要说明下文档加载完成是什么概念一个页面http请求访问时,浏览器会将它的html文件内容请求到本地解析,从窗口打开时开始解析这个document,页面初始的html结构和里面的文字等内容加载完成 ...
非常郁闷的 .NET中程序集的动态加载
记载这篇文章的原因是我自己遇到了动态加载程序集的问题,而困扰了一天之久. 最终看到了这篇博客:http://www.cnblogs.com/brucebi/archive/2013/05/22/Ass ...

随机推荐

马上AI全球挑战者大赛-违约用户风险预测
方案概述近年来,互联网金融已经是当今社会上的一个金融发展趋势.在金融领域,无论是投资理财还是借贷放款,风险控制永远是业务的核心基础.对于消费金融来说,其主要服务对象的特点是:额度小.人群大.周期短, ...
实体关系图应用——google ads
实体关系本页展示了 AdWords 实体的关系图,其中的可点击图片可帮助您找到最合适的文档. 表示法图例实体:链接到相关性最高的指南. 基数:允许的实例数量.例如,1..\* 表示允许一个或多个. ...
Learning-Python【30】：基于UDP协议通信的套接字
UDP协议没有粘包问题,但是缓冲区大小要足够装数据包大小,建议不要超过 512 服务端: # 服务端 import socket server = socket.socket(socket.AF_IN ...
inline-block和float 布局的选择
浮动通常表现正常,但有时候搞起来会很纠结.特别是处理内部容器中的浮动,比如对一排图片使用浮动后对齐出现问题.Inline-block是我们的另一种选择.使用这种属性可以模拟部分浮动的特征,而不需要处理 ...
19 中山重现赛 1002 triangle
题意:给一组数据a[0]...a[n], n<5e6, a[i]<2^31-1(1e9)判断是否存在三角形数首先想到的是排序,若a[i]+a[i+1]>a[i+2] , 则存在三 ...
java常用类介绍
1 日期时间.Math.枚举 1.1 日期时间计算机如何表示时间? GMT时间指格林尼治所在地的标准时间,也称为时间协调时(UTC),其他地区的时间都是相对于GMT时间的偏移. 北京位于东八区 = ...
Vue-Router + Vuex 实现单页面应用
效果查看(一个食品安全网,大家也可以发布一些食品安全的见闻,尽举手之劳): 源代码:https://pan.baidu.com/s/1i43H3LV 如果想要服务器端代码可以在评论里说明一下利用vu ...
Flutter采坑之路 Run Configuration error:broken configuration due to unavailable
今天把AndroidStudio升级成了3.3.1 原先还能编译成功的Flutter工程突然连编译都不行了, 错误是 Run Configuration error:broken configurat ...
1. Two Sum&&15. 3Sum&&18. 4Sum
题目: 1. Two Sum Given an array of integers, return indices of the two numbers such that they add up t ...
『TensorFlow』one_hot化标签
tf.one_hot(indices, depth):将目标序列转换成one_hot编码 tf.one_hot(indices, depth, on_value=None, off_value=Non ...

solr实现动态加载分词

solr实现动态加载分词的更多相关文章

随机推荐

热门专题