/**
* DoubleArrayTrie: Java implementation of Darts (Double-ARray Trie System)
*
* <p>
* Copyright(C) 2001-2007 Taku Kudo &lt;taku@chasen.org&gt;<br />
* Copyright(C) 2009 MURAWAKI Yugo &lt;murawaki@nlp.kuee.kyoto-u.ac.jp&gt;
* Copyright(C) 2012 KOMIYA Atsushi &lt;komiya.atsushi@gmail.com&gt;
* </p>
*
* <p>
* The contents of this file may be used under the terms of either of the GNU
* Lesser General Public License Version 2.1 or later (the "LGPL"), or the BSD
* License (the "BSD").
* </p>
*/
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

/**
 * A double-array trie over {@link String} keys.
 *
 * <p>
 * The trie is stored in two parallel {@code int} arrays, {@code base} and
 * {@code check}. A transition from the node whose base value is {@code b} on
 * character {@code c} goes to index {@code p = b + c + 1} and is valid only
 * when {@code check[p] == b}. The end of a key is encoded as a transition with
 * code {@code 0}, i.e. at index {@code b} itself, and the base value stored
 * there is {@code -value - 1}.
 * </p>
 *
 * <p>
 * Instances are mutable; nothing in this class synchronizes access.
 * </p>
 */
public class DoubleArrayTrie {
    private static final int BUF_SIZE = 16384;
    private static final int UNIT_SIZE = 8; // size of int + int

    /**
     * One trie node as seen during construction.
     */
    public static class Node {
        /** Character code of the edge leading here: {@code char + 1}, or 0 for "end of key". */
        int code;
        /** Number of characters consumed to reach this node (parent depth + 1). */
        int depth;
        /** Index of the first key (inclusive) that passes through this node. */
        int left;
        /** Index one past the last key (exclusive) that passes through this node. */
        int right;
    }

    private int[] check;
    private int[] base;

    /** Marks base offsets already handed out by {@link #insert(List)}; only used while building. */
    private boolean[] used;
    /** Number of leading array cells that belong to the trie. */
    private int size;
    /** Allocated length of the working arrays. */
    private int allocSize;
    private List<String> key;
    private int keySize;
    private int[] length;
    private int[] value;
    private int progress;
    private int nextCheckPos;
    /** Construction-time nodes of internal (non-leaf) cells, indexed by array position. */
    private Node[] nodes;
    /** 0 when the last build succeeded, negative otherwise (see {@link #build(List, int[], int[], int)}). */
    int error_;

    /**
     * Grows all working arrays to {@code newSize}, keeping the first
     * {@code allocSize} cells.
     *
     * @param newSize new allocated length
     * @return the new allocated length
     */
    private int resize(int newSize) {
        int[] base2 = new int[newSize];
        int[] check2 = new int[newSize];
        boolean[] used2 = new boolean[newSize];
        Node[] nodes2 = new Node[newSize];
        if (allocSize > 0) {
            System.arraycopy(base, 0, base2, 0, allocSize);
            System.arraycopy(check, 0, check2, 0, allocSize);
            System.arraycopy(used, 0, used2, 0, allocSize);
            System.arraycopy(nodes, 0, nodes2, 0, allocSize);
        }

        base = base2;
        check = check2;
        used = used2;
        nodes = nodes2;

        return allocSize = newSize;
    }

    /**
     * Collects the children of {@code parent} into {@code siblings}.
     *
     * <p>
     * Keys in {@code [parent.left, parent.right)} are grouped by the character
     * at position {@code parent.depth}; a key that ends exactly at that depth
     * contributes the code 0. If the codes are not in non-decreasing order,
     * {@code error_} is set to -3.
     * </p>
     *
     * @param parent   node whose children are wanted
     * @param siblings output list, appended to in code order
     * @return the number of entries in {@code siblings}, or 0 on error
     */
    private int fetch(Node parent, List<Node> siblings) {
        if (error_ < 0) {
            return 0;
        }

        int prev = 0;

        for (int i = parent.left; i < parent.right; i++) {
            String tmp = key.get(i);
            int keyLength = (length != null) ? length[i] : tmp.length();
            if (keyLength < parent.depth) {
                continue;
            }

            int cur = 0;
            if (keyLength != parent.depth) {
                cur = (int) tmp.charAt(parent.depth) + 1;
            }

            if (prev > cur) {
                // Codes went backwards, so the keys are not sorted.
                error_ = -3;
                return 0;
            }

            if (cur != prev || siblings.size() == 0) {
                Node child = new Node();
                child.depth = parent.depth + 1;
                child.code = cur;
                child.left = i;
                if (siblings.size() != 0) {
                    siblings.get(siblings.size() - 1).right = i;
                }

                siblings.add(child);
            }

            prev = cur;
        }

        if (siblings.size() != 0) {
            siblings.get(siblings.size() - 1).right = parent.right;
        }

        return siblings.size();
    }

    /**
     * Places a group of sibling nodes into the arrays and recursively inserts
     * their descendants.
     *
     * @param siblings non-empty list of children sharing one parent, in code order
     * @return the base offset chosen for the group, or 0 on error
     */
    private int insert(List<Node> siblings) {
        if (error_ < 0) {
            return 0;
        }

        final int firstCode = siblings.get(0).code;
        final int lastCode = siblings.get(siblings.size() - 1).code;

        int begin = 0;
        int pos = Math.max(firstCode + 1, nextCheckPos) - 1;
        int nonzeroNum = 0;
        boolean first = true;

        if (allocSize <= pos) {
            resize(pos + 1);
        }

        outer: while (true) {
            pos++;

            if (allocSize <= pos) {
                resize(pos + 1);
            }

            if (check[pos] != 0) {
                nonzeroNum++;
                continue;
            } else if (first) {
                nextCheckPos = pos;
                first = false;
            }

            begin = pos - firstCode;
            if (allocSize <= (begin + lastCode)) {
                // progress can be zero
                double ratio = 1.0 * keySize / (progress + 1);
                double l = (1.05 > ratio) ? 1.05 : ratio;
                resize((int) (allocSize * l));
            }

            if (used[begin]) {
                continue;
            }

            for (int i = 1; i < siblings.size(); i++) {
                if (check[begin + siblings.get(i).code] != 0) {
                    continue outer;
                }
            }

            break;
        }

        // -- Simple heuristics --
        // If the share of occupied cells in check between 'nextCheckPos' and
        // 'pos' reaches 0.95, start the next search at 'pos' instead.
        if (1.0 * nonzeroNum / (pos - nextCheckPos + 1) >= 0.95) {
            nextCheckPos = pos;
        }

        used[begin] = true;
        size = Math.max(size, begin + lastCode + 1);

        for (int i = 0; i < siblings.size(); i++) {
            check[begin + siblings.get(i).code] = begin;
        }

        for (int i = 0; i < siblings.size(); i++) {
            Node sibling = siblings.get(i);
            int index = begin + sibling.code;
            List<Node> newSiblings = new ArrayList<Node>();

            if (fetch(sibling, newSiblings) == 0) {
                // Leaf: store the value (or the key index) as a negative base.
                base[index] = (value != null)
                        ? (-value[sibling.left] - 1)
                        : (-sibling.left - 1);

                if (value != null && (-value[sibling.left] - 1) >= 0) {
                    // A negative value cannot be encoded.
                    error_ = -2;
                    return 0;
                }

                progress++;
            } else {
                int h = insert(newSiblings);
                base[index] = h;
                nodes[index] = sibling;
            }
        }
        return begin;
    }

    /**
     * Creates an empty trie; call a {@code build} or {@code open} method to
     * populate it.
     */
    public DoubleArrayTrie() {
        check = null;
        base = null;
        used = null;
        nodes = null;
        size = 0;
        allocSize = 0;
        error_ = 0;
    }

    // no destructor

    // set_result omitted
    // the search methods return (the list of) the value(s) instead
    // of (the list of) the pair(s) of value(s) and length(s)

    // set_array omitted
    // array omitted

    /**
     * Releases the arrays and resets the sizes to zero.
     */
    void clear() {
        check = null;
        base = null;
        used = null;
        nodes = null;
        allocSize = 0;
        size = 0;
    }

    /**
     * @return the number of bytes one (base, check) cell occupies when saved
     */
    public int getUnitSize() {
        return UNIT_SIZE;
    }

    /**
     * @return the number of cells that belong to the trie
     */
    public int getSize() {
        return size;
    }

    /**
     * @return {@code getSize() * getUnitSize()}
     */
    public int getTotalSize() {
        return size * UNIT_SIZE;
    }

    /**
     * @return the number of cells within {@code getSize()} whose check value is non-zero
     */
    public int getNonzeroSize() {
        int result = 0;
        for (int i = 0; i < size; i++) {
            if (check[i] != 0) {
                result++;
            }
        }
        return result;
    }

    /**
     * Builds the trie from {@code key}, using each key's index as its value.
     *
     * @param key keys in ascending {@code char} order
     * @return 0 on success, a negative error code otherwise
     */
    public int build(List<String> key) {
        int _keySize = key.size();
        int[] _length = new int[_keySize];
        for (int i = 0; i < _keySize; i++) {
            _length[i] = key.get(i).length();
        }
        return build(key, _length, null, _keySize);
    }

    /**
     * Builds the trie, discarding whatever it held before.
     *
     * @param _key     keys in ascending {@code char} order
     * @param _length  length to use for each key, or {@code null} to use
     *                 {@code String.length()}
     * @param _value   non-negative value for each key, or {@code null} to use
     *                 the key's index
     * @param _keySize number of leading entries of {@code _key} to insert
     * @return 0 on success; -2 if a value is negative; -3 if the keys are not
     *         sorted. Also returns 0, without building anything, when
     *         {@code _key} is {@code null} or shorter than {@code _keySize}.
     */
    public int build(List<String> _key, int[] _length, int[] _value, int _keySize) {
        // The null test must come first; otherwise _key.size() throws.
        if (_key == null || _keySize > _key.size()) {
            return 0;
        }

        // Start from a clean slate so that a second build neither trips over
        // the arrays released by the previous one nor inherits its error.
        clear();
        error_ = 0;

        key = _key;
        length = _length;
        keySize = _keySize;
        value = _value;
        progress = 0;

        resize(65536 * 32);

        base[0] = 1;
        nextCheckPos = 0;

        Node rootNode = new Node();
        rootNode.left = 0;
        rootNode.right = keySize;
        rootNode.depth = 0;

        List<Node> siblings = new ArrayList<Node>();
        fetch(rootNode, siblings);
        // With no keys there is nothing to place; insert() needs a first sibling.
        if (!siblings.isEmpty()) {
            insert(siblings);
        }

        used = null;
        key = null;

        return error_;
    }

    /**
     * Loads a trie, including key lengths, from a file written by
     * {@link #save(String)}.
     *
     * @param fileName path of the file to read
     * @throws IOException if the file cannot be read or is malformed
     */
    public void open(String fileName) throws IOException {
        open(fileName, true);
    }

    /**
     * Loads a trie from a stream and closes the stream.
     *
     * <p>
     * The expected layout is: cell count, then (base, check) pairs, then, if
     * {@code saveLen} is true, the key count followed by one length per key.
     * The state of this object changes only after everything was read.
     * </p>
     *
     * @param inputStream stream to read from; always closed before returning
     * @param saveLen     whether the key-length section is present;
     *                    {@code null} is treated as {@code false}
     * @throws IOException if reading fails or a stored count is negative
     */
    public void open(InputStream inputStream, Boolean saveLen) throws IOException {
        DataInputStream is = null;
        try {
            is = new DataInputStream(new BufferedInputStream(inputStream, BUF_SIZE));

            int loadedSize = is.readInt();
            if (loadedSize < 0) {
                throw new IOException("Invalid trie size: " + loadedSize);
            }
            int[] loadedBase = new int[loadedSize];
            int[] loadedCheck = new int[loadedSize];
            for (int i = 0; i < loadedSize; i++) {
                loadedBase[i] = is.readInt();
                loadedCheck[i] = is.readInt();
            }

            boolean withLengths = Boolean.TRUE.equals(saveLen);
            int loadedKeySize = 0;
            int[] loadedLength = null;
            if (withLengths) {
                loadedKeySize = is.readInt();
                if (loadedKeySize < 0) {
                    throw new IOException("Invalid key count: " + loadedKeySize);
                }
                loadedLength = new int[loadedKeySize];
                for (int i = 0; i < loadedKeySize; i++) {
                    loadedLength[i] = is.readInt();
                }
            }

            size = loadedSize;
            allocSize = loadedSize;
            base = loadedBase;
            check = loadedCheck;
            // Construction-time data is not stored in the file.
            used = null;
            nodes = null;
            if (withLengths) {
                keySize = loadedKeySize;
                length = loadedLength;
            }
        } finally {
            if (is != null) {
                is.close();
            } else if (inputStream != null) {
                inputStream.close();
            }
        }
    }

    /**
     * Loads a trie from a file.
     *
     * @param fileName path of the file to read
     * @param saveLen  whether the file contains the key-length section;
     *                 {@code null} is treated as {@code false}
     * @throws IOException if the file cannot be read or is malformed
     */
    public void open(String fileName, Boolean saveLen) throws IOException {
        File file = new File(fileName);
        open(new FileInputStream(file), saveLen);
    }

    /**
     * Writes the trie, including key lengths, to a file.
     *
     * @param fileName path of the file to write
     * @throws IOException if writing fails
     */
    public void save(String fileName) throws IOException {
        save(fileName, true);
    }

    /**
     * Writes the trie to a file.
     *
     * @param fileName path of the file to write
     * @param saveLen  whether to append the key count and key lengths;
     *                 {@code null} is treated as {@code false}
     * @throws IOException if writing fails
     */
    public void save(String fileName, Boolean saveLen) throws IOException {
        DataOutputStream out = null;
        try {
            out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(fileName)));
            out.writeInt(size);
            for (int i = 0; i < size; i++) {
                out.writeInt(base[i]);
                out.writeInt(check[i]);
            }
            if (Boolean.TRUE.equals(saveLen)) {
                out.writeInt(keySize);
                for (int i = 0; i < keySize; i++) {
                    out.writeInt(length[i]);
                }
            }
        } finally {
            if (out != null) {
                // close() flushes the buffer, so one call is enough.
                out.close();
            }
        }
    }

    /**
     * Returns the length of a matched word.
     *
     * @param index key index
     * @return the stored length of that key
     */
    public int getWordLen(int index) {
        return length[index];
    }

    /** Tells whether {@code index} addresses a cell that belongs to the trie. */
    private boolean isValidIndex(int index) {
        return index >= 0 && index < size && base != null && check != null;
    }

    /**
     * Returns the value stored for a key ending at the node whose base is
     * {@code b}, or -1 if no key ends there.
     */
    private int valueAt(int b) {
        if (!isValidIndex(b)) {
            return -1;
        }
        int n = base[b];
        return (b == check[b] && n < 0) ? -n - 1 : -1;
    }

    /**
     * Looks up {@code key} as a whole.
     *
     * @param key key to look up
     * @return the key's value, or -1 if it is not in the trie
     */
    public int exactMatchSearch(String key) {
        return exactMatchSearch(key, 0, 0, 0);
    }

    /**
     * Looks up the characters {@code key[pos..len)} starting from a given node.
     *
     * @param key     text to match
     * @param pos     index of the first character to consume
     * @param len     index one past the last character to consume; values
     *                {@code <= 0} mean {@code key.length()}
     * @param nodePos array index of the start node; values {@code <= 0} mean the root
     * @return the matched key's value, or -1 if there is no exact match
     */
    public int exactMatchSearch(String key, int pos, int len, int nodePos) {
        if (len <= 0) {
            len = key.length();
        }
        if (nodePos <= 0) {
            nodePos = 0;
        }
        if (!isValidIndex(nodePos)) {
            return -1;
        }

        int b = base[nodePos];

        for (int i = pos; i < len; i++) {
            int p = b + (int) key.charAt(i) + 1;
            // A loaded trie's arrays are exactly 'size' long, so an unknown
            // character can point past the end.
            if (isValidIndex(p) && b == check[p]) {
                b = base[p];
            } else {
                return -1;
            }
        }

        return valueAt(b);
    }

    /**
     * Finds every key that is a prefix of {@code key}.
     *
     * @param key text to match
     * @return the values of all matching keys, shortest key first
     */
    public List<Integer> commonPrefixSearch(String key) {
        return commonPrefixSearch(key, 0, 0, 0);
    }

    /**
     * Finds every key that is a prefix of {@code key[pos..len)}, starting from
     * a given node.
     *
     * @param key     text to match
     * @param pos     index of the first character to consume
     * @param len     index one past the last character to consume; values
     *                {@code <= 0} mean {@code key.length()}
     * @param nodePos array index of the start node; values {@code <= 0} mean the root
     * @return the values of all matching keys, shortest key first; never {@code null}
     */
    public List<Integer> commonPrefixSearch(String key, int pos, int len, int nodePos) {
        if (len <= 0) {
            len = key.length();
        }
        if (nodePos <= 0) {
            nodePos = 0;
        }

        List<Integer> result = new ArrayList<Integer>();
        if (!isValidIndex(nodePos)) {
            return result;
        }

        int b = base[nodePos];

        for (int i = pos; i < len; i++) {
            int found = valueAt(b);
            if (found >= 0) {
                result.add(found);
            }

            int p = b + (int) key.charAt(i) + 1;
            if (isValidIndex(p) && b == check[p]) {
                b = base[p];
            } else {
                return result;
            }
        }

        int found = valueAt(b);
        if (found >= 0) {
            result.add(found);
        }

        return result;
    }

    /**
     * Partial match: walks {@code key[pos..len)} through the trie without
     * requiring a key to end there. For example, with "成都市" in the
     * dictionary, "成都" still counts as a match.
     *
     * @param key     text to match
     * @param pos     index of the first character to consume
     * @param len     index one past the last character to consume; values
     *                {@code <= 0} mean {@code key.length()}
     * @param nodePos array index of the start node; values {@code <= 0} mean the root
     * @return the array index of the node reached after the last character,
     *         or -1 if the walk fails or no character was consumed
     */
    public Integer partMatchSearch(String key, int pos, int len, int nodePos) {
        if (len <= 0) {
            len = key.length();
        }
        if (nodePos <= 0) {
            nodePos = 0;
        }
        if (!isValidIndex(nodePos)) {
            return -1;
        }

        int b = base[nodePos];
        int p = -1;

        for (int i = pos; i < len; i++) {
            p = b + (int) key.charAt(i) + 1;
            if (isValidIndex(p) && b == check[p]) {
                b = base[p];
            } else {
                return -1;
            }
        }
        return p;
    }

    /**
     * Returns the construction-time node stored at an array index.
     *
     * @param index array index, e.g. a result of
     *              {@link #partMatchSearch(String, int, int, int)}
     * @return the node, or {@code null} if {@code index} is {@code null},
     *         negative or out of range, if no node was recorded there, or if
     *         the trie was loaded from a file (nodes are not saved)
     */
    public Node getNode(Integer index) {
        if (index == null || index < 0 || nodes == null || index >= nodes.length) {
            return null;
        }
        return nodes[index];
    }

    /**
     * Tells whether the node at {@code nodePos} is only a partial match, which
     * decides how the next word is matched.
     *
     * @param key     not used
     * @param nodePos array index of the node to inspect
     * @return {@code false} if a key ends at that node, {@code true} otherwise
     */
    public boolean isPartMatched(String key, int nodePos) {
        return valueAt(base[nodePos]) < 0;
    }

    /**
     * Two tries are equal when they have the same size, the same key count,
     * the same (base, check) cells within that size and the same key lengths.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof DoubleArrayTrie)) {
            return false;
        }
        DoubleArrayTrie other = (DoubleArrayTrie) obj;
        if (size != other.size || keySize != other.keySize) {
            return false;
        }
        for (int i = 0; i < size; i++) {
            if (base[i] != other.base[i] || check[i] != other.check[i]) {
                return false;
            }
        }
        if (length == null || other.length == null) {
            return length == other.length;
        }
        for (int i = 0; i < keySize; i++) {
            if (length[i] != other.length[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Hashes exactly the data that {@link #equals(Object)} compares, so equal
     * tries have equal hash codes.
     */
    @Override
    public int hashCode() {
        int h = 31 * size + keySize;
        for (int i = 0; i < size; i++) {
            h = 31 * h + base[i];
            h = 31 * h + check[i];
        }
        if (length != null) {
            for (int i = 0; i < keySize; i++) {
                h = 31 * h + length[i];
            }
        }
        return h;
    }
}

DoubleArrayTrie的更多相关文章

  1. 双数组Trie树(DoubleArrayTrie)Java实现

    http://www.hankcs.com/program/java/%E5%8F%8C%E6%95%B0%E7%BB%84trie%E6%A0%91doublearraytriejava%E5%AE ...

  2. An Implementation of Double-Array Trie

    Contents What is Trie? What Does It Take to Implement a Trie? Tripple-Array Trie Double-Array Trie S ...

  3. 中文分词系列(二) 基于双数组Trie树的AC自动机

    秉着能偷懒就偷懒的精神,关于AC自动机本来不想看的,但是HanLp的源码中用户自定义词典的识别是用的AC自动机实现的.唉-没办法,还是看看吧 AC自动机理论 Aho Corasick自动机,简称AC自 ...

  4. 从Trie树到双数组Trie树

    Trie树 原理 又称单词查找树,Trie树,是一种树形结构,是一种哈希树的变种.它的优点是:利用字符串的公共前缀来减少查询时间,最大限度地减少无谓的字符串比较,能在常数时间O(len)内实现插入和查 ...

  5. Trie树(字典树)推荐文章

    Trie树也被称为字典树,通过这个名字,可以明显知道这种树的结构:像字典一样进行查找的树(想想采用拼音法查找汉字的时候的过程,实质上就是一个逐字母匹配的过程).Trie树就是利用了这种思想构造出来的多 ...

  6. HanLP用户自定义词典源码分析

    HanLP用户自定义词典源码分析 1. 官方文档及参考链接 关于词典问题Issue,首先参考:FAQ 自定义词典其实是基于规则的分词,它的用法参考这个issue 如果有些数量词.字母词需要分词,可参考 ...

  7. Hanlp自然语言处理中的词典格式说明

    使用过hanlp的都知道hanlp中有许多词典,它们的格式都是非常相似的,形式都是文本文档,随时可以修改.本篇文章详细介绍了hanlp中的词典格式,以满足用户自定义的需要. 基本格式 词典分为词频词性 ...

  8. python利用Trie(前缀树)实现搜索引擎中关键字输入提示(学习Hash Trie和Double-array Trie)

    python利用Trie(前缀树)实现搜索引擎中关键字输入提示(学习Hash Trie和Double-array Trie) 主要包括两部分内容:(1)利用python中的dict实现Trie:(2) ...

  9. 利用trie树实现前缀输入提示及trie的python实现

    代码来自https://github.com/wklken/suggestion/blob/master/easymap/suggest.py 还实现了缓存功能,搜索某个前缀超过一定次数时,进行缓存, ...

随机推荐

  1. Docker在linux系统下的安装

    系统要求 本安装教程仅限于CentOS7,其他系统不适用.centos-extras仓库必须是启用状态,这个仓库默认状态是启用,如果不是启用状态,请修改. 卸载旧版本的Docker Docker的旧版 ...

  2. Django CBV和FBV

    Django CBV和FBV Django内部CBV内部接收方法操作: 1.通过客户端返回的请求头RequestMethod与RequestURL,会以字符串形式发送到服务器端. 2.取到值后通过d ...

  3. JS异步操作概述(转)

    add by zhj: 只转载了一部分.异步操作的三种模式未转载,因为里面代码比较多,复制过来麻烦 原文:https://wangdoc.com/javascript/async/general.ht ...

  4. UDP比TCP好用的优势

    网络带宽环境变好 在2007年至2015年间,网络的带宽飞速发展,从1.5Mbps的带宽增加到5.1Mbps的带宽,足足增加了4倍,网络环境快速.稳定,所以UDP的丢包率 下降至5%以下,越来越好的网 ...

  5. 关于Java链接c#的webapi的注意事项

    最近写了一个关于ad域的项目,ad域我也是第一次接触,对ad域的总结我会晚一些时间写出来.在此我先总结一下关于Java调用c#的webapi的一个注意点. [HttpPost] public Dict ...

  6. String.Operation

    // 字符串切割 StringField.Split(",".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);

  7. javascript Class.method vs Class.prototype.method(类方法和对象方法)

    在stackoverflow上看到一个这样的提问,以下代码有什么区别? Class.method = function () { /* code */ } Class.prototype.method ...

  8. php批量检查https证书有效期

    function get_cert_info($domain){ $context = stream_context_create(['ssl' => [ 'capture_peer_cert' ...

  9. JS中判断对象是对象还是数组的方法

    https://www.cnblogs.com/ma-shuai/p/7805264.html

  10. JavaScript 之 创建元素

    方式一: 使用  document.write() 语法格式: document.write('新设置的内容<p>标签也可以生成</p>'); 注意:在使用方式的时候,writ ...