/**
* DoubleArrayTrie: Java implementation of Darts (Double-ARray Trie System)
*
* <p>
* Copyright(C) 2001-2007 Taku Kudo &lt;taku@chasen.org&gt;<br />
* Copyright(C) 2009 MURAWAKI Yugo &lt;murawaki@nlp.kuee.kyoto-u.ac.jp&gt;
* Copyright(C) 2012 KOMIYA Atsushi &lt;komiya.atsushi@gmail.com&gt;
* </p>
*
* <p>
* The contents of this file may be used under the terms of either of the GNU
* Lesser General Public License Version 2.1 or later (the "LGPL"), or the BSD
* License (the "BSD").
* </p>
*/
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

/**
 * Double-array trie over a sorted list of string keys.
 *
 * <p>
 * The trie is stored in two parallel {@code int} arrays, {@code base} and
 * {@code check}. A transition from a node whose base value is {@code b} on
 * character {@code c} goes to index {@code p = b + c + 1} and is valid only
 * when {@code check[p] == b}. Code {@code 0} is reserved for the
 * end-of-key terminator; a terminator slot stores {@code -value - 1} in
 * {@code base}, so stored values must be non-negative.
 * </p>
 *
 * <p>
 * Instances are mutable and perform no synchronization.
 * </p>
 */
public class DoubleArrayTrie {
    private final static int BUF_SIZE = 16384;
    private final static int UNIT_SIZE = 8; // size of int + int

    /**
     * A trie node as seen during construction: the transition code leading to
     * it, its depth, and the half-open range {@code [left, right)} of key
     * indices that pass through it.
     */
    public static class Node {
        int code;
        int depth;
        int left;
        int right;
    }

    private int check[];
    private int base[];

    private boolean used[];
    private int size;
    private int allocSize;
    private List<String> key;
    private int keySize;
    private int length[];
    private int value[];
    private int progress;
    private int nextCheckPos;
    private Node[] nodes;
    // boolean no_delete_;
    int error_;

    // int (*progressfunc_) (size_t, size_t);

    // inline _resize expanded
    /**
     * Reallocates all working arrays to {@code newSize} slots, preserving the
     * first {@code allocSize} entries of each.
     *
     * @param newSize new capacity; callers only ever grow the arrays
     * @return the new capacity
     */
    private int resize(int newSize) {
        int[] base2 = new int[newSize];
        int[] check2 = new int[newSize];
        boolean used2[] = new boolean[newSize];
        Node[] nodes2 = new Node[newSize];
        if (allocSize > 0) {
            System.arraycopy(base, 0, base2, 0, allocSize);
            System.arraycopy(check, 0, check2, 0, allocSize);
            System.arraycopy(used, 0, used2, 0, allocSize);
            System.arraycopy(nodes, 0, nodes2, 0, allocSize);
        }

        base = base2;
        check = check2;
        used = used2;
        nodes = nodes2;

        return allocSize = newSize;
    }

    /**
     * Collects the distinct children of {@code parent} into {@code siblings}.
     *
     * <p>
     * Keys in {@code [parent.left, parent.right)} are grouped by their
     * character at position {@code parent.depth} (code = char + 1, or 0 when
     * the key ends exactly at that depth). Sets {@code error_} to -3 and
     * returns 0 when the codes are not in ascending order, i.e. the key list
     * is not sorted.
     * </p>
     *
     * @param parent   node whose children are wanted
     * @param siblings output list, appended to in ascending code order
     * @return number of entries in {@code siblings}, or 0 on error
     */
    private int fetch(Node parent, List<Node> siblings) {
        if (error_ < 0)
            return 0;

        int prev = 0;

        for (int i = parent.left; i < parent.right; i++) {
            String word = key.get(i);
            int wordLen = (length != null) ? length[i] : word.length();
            if (wordLen < parent.depth)
                continue;

            int cur = 0;
            if (wordLen != parent.depth)
                cur = (int) word.charAt(parent.depth) + 1;

            if (prev > cur) {
                error_ = -3;
                return 0;
            }

            if (cur != prev || siblings.size() == 0) {
                Node child = new Node();
                child.depth = parent.depth + 1;
                child.code = cur;
                child.left = i;
                // The previous child's key range ends where this one starts.
                if (siblings.size() != 0)
                    siblings.get(siblings.size() - 1).right = i;

                siblings.add(child);
            }

            prev = cur;
        }

        if (siblings.size() != 0)
            siblings.get(siblings.size() - 1).right = parent.right;

        return siblings.size();
    }

    /**
     * Finds a base offset at which every sibling's slot is free, claims those
     * slots, and recursively inserts each sibling's own children.
     *
     * <p>
     * Sets {@code error_} to -2 when a supplied value would encode to a
     * non-negative base (i.e. the value is negative).
     * </p>
     *
     * @param siblings non-empty list of children sharing one parent
     * @return the chosen base offset, or 0 on error
     */
    private int insert(List<Node> siblings) {
        if (error_ < 0)
            return 0;

        // Node codes are not modified while this call runs, so cache the extremes.
        int firstCode = siblings.get(0).code;
        int lastCode = siblings.get(siblings.size() - 1).code;

        int begin = 0;
        int pos = ((firstCode + 1 > nextCheckPos) ? firstCode + 1 : nextCheckPos) - 1;
        int nonzero_num = 0;
        int first = 0;

        if (allocSize <= pos)
            resize(pos + 1);

        outer: while (true) {
            pos++;

            if (allocSize <= pos)
                resize(pos + 1);

            if (check[pos] != 0) {
                nonzero_num++;
                continue;
            } else if (first == 0) {
                nextCheckPos = pos;
                first = 1;
            }

            begin = pos - firstCode;
            if (allocSize <= (begin + lastCode)) {
                // progress can be zero
                double l = (1.05 > 1.0 * keySize / (progress + 1)) ? 1.05
                        : 1.0 * keySize / (progress + 1);
                // Never grow to less than what this sibling group needs.
                resize(Math.max((int) (allocSize * l), begin + lastCode + 1));
            }

            if (used[begin])
                continue;

            for (int i = 1; i < siblings.size(); i++)
                if (check[begin + siblings.get(i).code] != 0)
                    continue outer;

            break;
        }

        // -- Simple heuristics --
        // if the percentage of non-empty contents in check between the
        // index 'next_check_pos' and 'check' is greater than some constant
        // value (e.g. 0.9), new 'next_check_pos' index is written by 'check'.
        if (1.0 * nonzero_num / (pos - nextCheckPos + 1) >= 0.95)
            nextCheckPos = pos;

        used[begin] = true;
        size = (size > begin + lastCode + 1) ? size : begin + lastCode + 1;

        for (int i = 0; i < siblings.size(); i++)
            check[begin + siblings.get(i).code] = begin;

        for (int i = 0; i < siblings.size(); i++) {
            Node sibling = siblings.get(i);
            List<Node> new_siblings = new ArrayList<Node>();

            if (fetch(sibling, new_siblings) == 0) {
                // Terminator slot: store the value (or the key index) negated.
                base[begin + sibling.code] = (value != null)
                        ? (-value[sibling.left] - 1)
                        : (-sibling.left - 1);

                if (value != null && (-value[sibling.left] - 1) >= 0) {
                    error_ = -2;
                    return 0;
                }

                progress++;
                // if (progress_func_) (*progress_func_) (progress,
                // keySize);
            } else {
                int h = insert(new_siblings);
                base[begin + sibling.code] = h;
                nodes[begin + sibling.code] = sibling;
            }
        }
        return begin;
    }

    /** Creates an empty trie; call {@code build} or {@code open} before searching. */
    public DoubleArrayTrie() {
        check = null;
        base = null;
        used = null;
        size = 0;
        allocSize = 0;
        // no_delete_ = false;
        error_ = 0;
    }

    // no deconstructor

    // set_result omitted
    // the search methods returns (the list of) the value(s) instead
    // of (the list of) the pair(s) of value(s) and length(s)

    // set_array omitted
    // array omitted

    /** Releases the trie arrays and resets the size counters. */
    void clear() {
        // if (! no_delete_)
        check = null;
        base = null;
        used = null;
        nodes = null;
        allocSize = 0;
        size = 0;
        // no_delete_ = false;
    }

    /** @return the number of bytes one (base, check) unit occupies */
    public int getUnitSize() {
        return UNIT_SIZE;
    }

    /** @return the number of units in use */
    public int getSize() {
        return size;
    }

    /** @return {@code getSize() * getUnitSize()} */
    public int getTotalSize() {
        return size * UNIT_SIZE;
    }

    /** @return the number of units in use whose check value is non-zero */
    public int getNonzeroSize() {
        int result = 0;
        for (int i = 0; i < size; i++)
            if (check[i] != 0)
                result++;
        return result;
    }

    /**
     * Builds the trie from a sorted key list; each key's value is its index.
     *
     * @param key keys in ascending order
     * @return 0 on success (or when {@code key} is null), a negative error
     *         code otherwise
     */
    public int build(List<String> key) {
        if (key == null)
            return 0;
        int _keySize = key.size();
        int _length[] = new int[_keySize];
        for (int i = 0; i < _keySize; i++) {
            _length[i] = key.get(i).length();
        }
        return build(key, _length, null, _keySize);
    }

    /**
     * Builds the trie, discarding any previously built content.
     *
     * @param _key     keys in ascending order
     * @param _length  per-key lengths, or null to use {@code String.length()}
     * @param _value   per-key non-negative values, or null to use key indices
     * @param _keySize number of leading keys of {@code _key} to insert
     * @return 0 on success or when the arguments are rejected; -3 when the
     *         keys are not sorted; -2 when a value is negative
     */
    public int build(List<String> _key, int _length[], int _value[], int _keySize) {
        // Null check must come first; the original dereferenced _key before testing it.
        if (_key == null || _keySize > _key.size())
            return 0;

        // Start from a clean slate so the instance can be rebuilt: the working
        // arrays of a previous build are partially released and must not be copied.
        clear();
        error_ = 0;

        // progress_func_ = progress_func;
        key = _key;
        length = _length;
        keySize = _keySize;
        value = _value;
        progress = 0;

        resize(65536 * 32);

        base[0] = 1;
        nextCheckPos = 0;

        Node root_node = new Node();
        root_node.left = 0;
        root_node.right = keySize;
        root_node.depth = 0;

        List<Node> siblings = new ArrayList<Node>();
        // An empty key list (or a fetch error) yields no siblings; insert()
        // requires at least one.
        if (fetch(root_node, siblings) > 0)
            insert(siblings);

        // size += (1 << 8 * 2) + 1; // ???
        // if (size >= allocSize) resize (size);

        used = null;
        key = null;

        return error_;
    }

    /**
     * Loads a trie, including key lengths, from a file written by {@code save}.
     *
     * @param fileName path of the file to read
     * @throws IOException if the file cannot be read or is malformed
     */
    public void open(String fileName) throws IOException {
        open(fileName, true);
    }

    /**
     * Loads a trie from a stream in the format written by {@code save}. The
     * stream is always closed. The trie is left unchanged if reading fails.
     *
     * @param inputStream source of the serialized trie
     * @param saveLen     whether the key-length table follows the units
     * @throws IOException if reading fails or a stored size is negative
     */
    public void open(InputStream inputStream, Boolean saveLen) throws IOException {
        DataInputStream is = null;
        try {
            is = new DataInputStream(new BufferedInputStream(inputStream, BUF_SIZE));
            int newSize = is.readInt();
            if (newSize < 0)
                throw new IOException("Invalid double-array size: " + newSize);
            int[] newCheck = new int[newSize];
            int[] newBase = new int[newSize];
            for (int i = 0; i < newSize; i++) {
                newBase[i] = is.readInt();
                newCheck[i] = is.readInt();
            }
            int newKeySize = keySize;
            int[] newLength = length;
            if (saveLen) {
                newKeySize = is.readInt();
                if (newKeySize < 0)
                    throw new IOException("Invalid key count: " + newKeySize);
                newLength = new int[newKeySize];
                for (int i = 0; i < newKeySize; i++) {
                    newLength[i] = is.readInt();
                }
            }
            // Publish only after everything was read successfully.
            size = newSize;
            check = newCheck;
            base = newBase;
            keySize = newKeySize;
            length = newLength;
        } finally {
            if (is != null)
                is.close();
        }
    }

    /**
     * Loads a trie from a file in the format written by {@code save}.
     *
     * @param fileName path of the file to read
     * @param saveLen  whether the key-length table follows the units
     * @throws IOException if the file cannot be read or is malformed
     */
    public void open(String fileName, Boolean saveLen) throws IOException {
        File file = new File(fileName);
        //size = (int) file.length() / UNIT_SIZE;
        open(new FileInputStream(file), saveLen);
    }

    /**
     * Writes the trie, including key lengths, to a file.
     *
     * @param fileName path of the file to write
     * @throws IOException if writing fails
     */
    public void save(String fileName) throws IOException {
        save(fileName, true);
    }

    /**
     * Writes the trie to a file: the unit count, then (base, check) pairs,
     * then optionally the key count and the key lengths.
     *
     * @param fileName path of the file to write
     * @param saveLen  whether to append the key-length table
     * @throws IOException if writing fails
     */
    public void save(String fileName, Boolean saveLen) throws IOException {
        DataOutputStream out = null;
        try {
            out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(fileName)));
            out.writeInt(size);
            for (int i = 0; i < size; i++) {
                out.writeInt(base[i]);
                out.writeInt(check[i]);
            }
            if (saveLen) {
                out.writeInt(keySize);
                for (int i = 0; i < keySize; i++) {
                    out.writeInt(length[i]);
                }
            }
        } finally {
            // Single close; it also flushes the buffered data.
            if (out != null)
                out.close();
        }
    }

    /**
     * Returns the length of a matched word.
     *
     * @param index key index (a value returned by the search methods when the
     *              trie was built without explicit values)
     * @return the stored length of that key
     */
    public int getWordLen(int index) {
        return length[index];
    }

    /**
     * Looks up a whole key.
     *
     * @param key string to look up
     * @return the key's value, or -1 if it is not in the trie
     */
    public int exactMatchSearch(String key) {
        return exactMatchSearch(key, 0, 0, 0);
    }

    /**
     * Looks up {@code key[pos, len)} starting from node {@code nodePos}.
     *
     * @param key     string to look up
     * @param pos     index of the first character to consume
     * @param len     exclusive end index; values &lt;= 0 mean {@code key.length()}
     * @param nodePos node to start from; values &lt;= 0 mean the root
     * @return the value of the matched key, or -1 if there is no exact match
     */
    public int exactMatchSearch(String key, int pos, int len, int nodePos) {
        if (len <= 0)
            len = key.length();
        if (nodePos <= 0)
            nodePos = 0;

        int result = -1;

        char[] keyChars = key.toCharArray();

        int b = base[nodePos];
        int p;

        for (int i = pos; i < len; i++) {
            p = b + (int) (keyChars[i]) + 1;
            // Slots at or beyond 'size' are never part of the trie and may lie
            // outside the arrays (they are exactly 'size' long after open()).
            if (p >= 0 && p < size && b == check[p])
                b = base[p];
            else
                return result;
        }

        p = b;
        if (p < 0 || p >= size)
            return result;
        int n = base[p];
        if (b == check[p] && n < 0) {
            result = -n - 1;
        }
        return result;
    }

    /**
     * Finds every key that is a prefix of {@code key}.
     *
     * @param key string to scan
     * @return values of all matching keys, shortest first
     */
    public List<Integer> commonPrefixSearch(String key) {
        return commonPrefixSearch(key, 0, 0, 0);
    }

    /**
     * Finds every key that is a prefix of {@code key[pos, len)}, starting from
     * node {@code nodePos}.
     *
     * @param key     string to scan
     * @param pos     index of the first character to consume
     * @param len     exclusive end index; values &lt;= 0 mean {@code key.length()}
     * @param nodePos node to start from; values &lt;= 0 mean the root
     * @return values of all matching keys, shortest first
     */
    public List<Integer> commonPrefixSearch(String key, int pos, int len, int nodePos) {
        if (len <= 0)
            len = key.length();
        if (nodePos <= 0)
            nodePos = 0;

        List<Integer> result = new ArrayList<Integer>();

        char[] keyChars = key.toCharArray();

        int b = base[nodePos];
        int n;
        int p;

        for (int i = pos; i < len; i++) {
            p = b;
            if (p < 0 || p >= size)
                return result;
            n = base[p];

            if (b == check[p] && n < 0) {
                result.add(-n - 1);
            }

            p = b + (int) (keyChars[i]) + 1;
            if (p >= 0 && p < size && b == check[p])
                b = base[p];
            else
                return result;
        }

        p = b;
        if (p < 0 || p >= size)
            return result;
        n = base[p];

        if (b == check[p] && n < 0) {
            result.add(-n - 1);
        }

        return result;
    }

    /**
     * Partial match: succeeds when {@code key[pos, len)} is a prefix of some
     * dictionary entry (for example the query "chengdu" against the entry
     * "chengdushi"), whether or not it is a complete entry itself.
     *
     * @param key     string to match
     * @param pos     index of the first character to consume
     * @param len     exclusive end index; values &lt;= 0 mean {@code key.length()}
     * @param nodePos node to start from; values &lt;= 0 mean the root
     * @return the index of the node reached after the last character, or -1
     *         when a transition fails or no character was consumed
     */
    public Integer partMatchSearch(String key, int pos, int len, int nodePos) {
        if (len <= 0)
            len = key.length();
        if (nodePos <= 0)
            nodePos = 0;

        int result = -1;

        char[] keyChars = key.toCharArray();

        int b = base[nodePos];
        int p = -1;

        for (int i = pos; i < len; i++) {
            p = b + (int) (keyChars[i]) + 1;
            if (p >= 0 && p < size && b == check[p])
                b = base[p];
            else
                return result;
        }
        return p;
    }

    /**
     * Returns the construction-time node recorded at a trie index.
     *
     * @param index trie index, e.g. a result of {@code partMatchSearch}
     * @return the node, or null when the index is null, negative, out of
     *         range, or no node was recorded there
     */
    public Node getNode(Integer index) {
        if (index == null || index < 0 || nodes == null || index >= nodes.length) {
            return null;
        }
        return nodes[index];
    }

    /**
     * Tells whether the node at {@code nodePos} is only a partial match, which
     * callers use to decide how to match the next word.
     *
     * @param key     unused
     * @param nodePos index of a node reached by {@code partMatchSearch}
     * @return false when a key ends exactly at this node, true otherwise
     */
    public boolean isPartMatched(String key, int nodePos) {
        int b = base[nodePos];
        int p = b;
        int n = base[p];
        return !(b == check[p] && n < 0);
    }

    // debug
    //public void dump() {
    // for (int i = 0; i < size; i++) {
    // System.err.println("i: " + i + " [" + base[i] + ", " + check[i] + "]");
    // }
    //}

    /**
     * Two tries are equal when they have the same unit count and key count
     * and identical base, check and key-length contents over those ranges.
     */
    @Override
    public boolean equals(Object obj) {
        if (obj == this) {
            return true;
        }
        if (!(obj instanceof DoubleArrayTrie)) {
            return false;
        }
        DoubleArrayTrie datObj = (DoubleArrayTrie) obj;
        if (size != datObj.size || keySize != datObj.keySize) {
            return false;
        }
        return prefixEquals(base, datObj.base, size)
                && prefixEquals(check, datObj.check, size)
                && prefixEquals(length, datObj.length, keySize);
    }

    /** Hashes exactly the state that {@code equals} compares. */
    @Override
    public int hashCode() {
        int h = 17;
        h = 31 * h + size;
        h = 31 * h + keySize;
        h = prefixHash(h, base, size);
        h = prefixHash(h, check, size);
        h = prefixHash(h, length, keySize);
        return h;
    }

    /** Compares the first {@code n} elements of two arrays, tolerating nulls. */
    private static boolean prefixEquals(int[] a, int[] b, int n) {
        if (n <= 0 || a == b) {
            return true;
        }
        if (a == null || b == null || a.length < n || b.length < n) {
            return false;
        }
        for (int i = 0; i < n; i++) {
            if (a[i] != b[i]) {
                return false;
            }
        }
        return true;
    }

    /** Folds the first {@code n} elements of {@code a} into {@code seed}. */
    private static int prefixHash(int seed, int[] a, int n) {
        if (a == null) {
            return seed;
        }
        int limit = (n < a.length) ? n : a.length;
        int h = seed;
        for (int i = 0; i < limit; i++) {
            h = 31 * h + a[i];
        }
        return h;
    }
}

DoubleArrayTrie的更多相关文章

  1. 双数组Trie树(DoubleArrayTrie)Java实现

    http://www.hankcs.com/program/java/%E5%8F%8C%E6%95%B0%E7%BB%84trie%E6%A0%91doublearraytriejava%E5%AE ...

  2. An Implementation of Double-Array Trie

    Contents What is Trie? What Does It Take to Implement a Trie? Tripple-Array Trie Double-Array Trie S ...

  3. 中文分词系列(二) 基于双数组Trie树的AC自动机

    秉着能偷懒就偷懒的精神,关于AC自动机本来不想看的,但是HanLp的源码中用户自定义词典的识别是用的AC自动机实现的.唉-没办法,还是看看吧 AC自动机理论 Aho Corasick自动机,简称AC自 ...

  4. 从Trie树到双数组Trie树

    Trie树 原理 又称单词查找树,Trie树,是一种树形结构,是一种哈希树的变种.它的优点是:利用字符串的公共前缀来减少查询时间,最大限度地减少无谓的字符串比较,能在常数时间O(len)内实现插入和查 ...

  5. Trie树(字典树)推荐文章

    Trie树也被称为字典树,通过这个名字,可以明显知道这种树的结构:像字典一样进行查找的树(想想采用拼音法查找汉字的时候的过程,实质上就是一个逐字母匹配的过程).Trie树就是利用了这种思想构造出来的多 ...

  6. HanLP用户自定义词典源码分析

    HanLP用户自定义词典源码分析 1. 官方文档及参考链接 关于词典问题Issue,首先参考:FAQ 自定义词典其实是基于规则的分词,它的用法参考这个issue 如果有些数量词.字母词需要分词,可参考 ...

  7. Hanlp自然语言处理中的词典格式说明

    使用过hanlp的都知道hanlp中有许多词典,它们的格式都是非常相似的,形式都是文本文档,随时可以修改.本篇文章详细介绍了hanlp中的词典格式,以满足用户自定义的需要. 基本格式 词典分为词频词性 ...

  8. python利用Trie(前缀树)实现搜索引擎中关键字输入提示(学习Hash Trie和Double-array Trie)

    python利用Trie(前缀树)实现搜索引擎中关键字输入提示(学习Hash Trie和Double-array Trie) 主要包括两部分内容:(1)利用python中的dict实现Trie:(2) ...

  9. 利用trie树实现前缀输入提示及trie的python实现

    代码来自https://github.com/wklken/suggestion/blob/master/easymap/suggest.py 还实现了缓存功能,搜索某个前缀超过一定次数时,进行缓存, ...

随机推荐

  1. Controller如何进行重定向跳转

    因为在Controller的返回都是默认走视图解析器的InternalResourceViewResolver,而视图解析器都是进行请求转发,需要在返回时地址前加入字符redirect: 视图解析器不 ...

  2. FusionInsight大数据开发---SparkStreaming概述

    SparkStreaming概述 SparkStreaming是Spark核心API的一个扩展,它对实时流式数据的处理具有可扩展性.高吞吐量.可容错性等特点. SparkStreaming原理 Spa ...

  3. TRIO-basic指令--CAM

    大家好,今天更新TRIO的运动指令CAM(也就是CAM函数),CAM指令是控制器直接发送编码器脉冲形成的运动曲线,比如:正弦,余弦曲线,根据自己的精度需求进行描点,但并不一定点数越多精度就越高,以实际 ...

  4. [转]Python实现字符串反转的几种方法

    #第一种:使用字符串切片 result = s[::-1] #第二种:使用列表的reverse方法 l = list(s) l.reverse() result = "".join ...

  5. node_exporter新版本指标名称变化说明

    changelog如下 Breaking changes This release contains major breaking changes to metric names. Many metr ...

  6. Idea中编辑后需要重启问题

    发布的artifact要用exploded 配置On Update action 和On frame deactivation为Update classes and resources. 编辑完成后, ...

  7. 【转载】如何删除Windows远程桌面保存的账号密码数据

    在Windows系统中,无论是win7.win8还是win10系统,都可使用Windows系统自带的远程桌面连接工具来远程服务器,很多时候Windows远程桌面在连接一次后会自动保存连接的账号密码等信 ...

  8. ES6 字符串&正则表达式

    目录 第二章 字符串和正则表达式UTF-16码位codePointAt()方法String.fromCodePoint()方法normalize()方法正则表达式u修饰符其他字符串变更字符串中的字串识 ...

  9. JS JQUERY实现滚动条自动滚到底的方法

    $(function(){ var h = $(document).height()-$(window).height(); $(document).scrollTop(h); }); \ windo ...

  10. mysql编译安装下载地址(官网)

    https://dev.mysql.com/get/Downloads/MySQL-version number/mysql-version number.tar.gz 把这个地址上面的版本号改成自己 ...