DoubleArrayTrie

/**

 * DoubleArrayTrie: Java implementation of Darts (Double-ARray Trie System)

 *

 * <p>

 * Copyright(C) 2001-2007 Taku Kudo &lt;taku@chasen.org&gt;<br />

 * Copyright(C) 2009 MURAWAKI Yugo &lt;murawaki@nlp.kuee.kyoto-u.ac.jp&gt;

 * Copyright(C) 2012 KOMIYA Atsushi &lt;komiya.atsushi@gmail.com&gt;

 * </p>

 *

 * <p>

 * The contents of this file may be used under the terms of either of the GNU

 * Lesser General Public License Version 2.1 or later (the "LGPL"), or the BSD

 * License (the "BSD").

 * </p>

 */

import java.io.BufferedInputStream;

import java.io.BufferedOutputStream;

import java.io.DataInputStream;

import java.io.DataOutputStream;

import java.io.File;

import java.io.FileInputStream;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.InputStream;

import java.util.ArrayList;

import java.util.List;

/**
 * Double-array trie mapping string keys to non-negative integer values.
 *
 * <p>
 * The trie is stored in two parallel int arrays, {@code base} and
 * {@code check}. A transition from a unit with base value {@code b} on
 * character {@code c} leads to unit {@code p = b + c + 1}, and is valid only
 * when {@code check[p] == b}. A key ends at a unit with base value {@code b}
 * when {@code check[b] == b} and {@code base[b] < 0}; the stored value is then
 * {@code -base[b] - 1}.
 * </p>
 *
 * <p>
 * Keys handed to {@link #build(List)} must be ordered so that, at every
 * character position, the character codes seen by the builder never decrease;
 * otherwise the build stops with error code {@code -3}.
 * </p>
 */
public class DoubleArrayTrie {

    /** Buffer size, in bytes, used when reading a serialized trie. */
    private final static int BUF_SIZE  = 16384;

    /** Bytes taken by one serialized unit: one int of base plus one int of check. */
    private final static int UNIT_SIZE = 8;

    /** Number of units allocated up front by every build. */
    private final static int INITIAL_ALLOC_SIZE = 65536 * 32;

    /**
     * Construction-time description of one trie node. Nodes that have children
     * are also kept after the build and can be read back with
     * {@link DoubleArrayTrie#getNode(Integer)}.
     */
    public static class Node {

        /** Character code + 1, or 0 for the end-of-key marker. */
        int code;

        /** Number of characters consumed once this node is reached. */
        int depth;

        /** Index of the first key (inclusive) that passes through this node. */
        int left;

        /** Index one past the last key that passes through this node. */
        int right;
    }

    private int          check[];

    private int          base[];

    /** Marks the begin offsets already taken by a sibling group; build-time only. */
    private boolean      used[];

    /** Number of units in use (highest used index + 1). */
    private int          size;

    /** Current capacity of the unit arrays. */
    private int          allocSize;

    /** Keys being inserted; only set while a build is running. */
    private List<String> key;

    private int          keySize;

    /** Length of each key, or null when lengths were not supplied / not loaded. */
    private int          length[];

    /** Value of each key, or null to use the key index as value. */
    private int          value[];

    /** Number of keys inserted so far by the running build. */
    private int          progress;

    /** Position from which the search for a free unit starts. */
    private int          nextCheckPos;

    /** Node descriptions of the units that have children, indexed by unit. */
    private Node[]       nodes;

    /**
     * Result of the last build: 0 for success, -2 when a negative value was
     * supplied, -3 when the keys were not in the required order.
     */
    int                  error_;

    /**
     * Re-allocates the unit arrays with the given capacity, keeping the
     * existing contents.
     *
     * @param newSize new capacity in units
     * @return the new capacity
     */
    private int resize(int newSize) {
        int[] base2 = new int[newSize];
        int[] check2 = new int[newSize];
        boolean[] used2 = new boolean[newSize];
        Node[] nodes2 = new Node[newSize];

        // Never copy more than the destination can hold.
        int copyCount = Math.min(allocSize, newSize);
        if (copyCount > 0) {
            System.arraycopy(base, 0, base2, 0, copyCount);
            System.arraycopy(check, 0, check2, 0, copyCount);
            System.arraycopy(used, 0, used2, 0, copyCount);
            System.arraycopy(nodes, 0, nodes2, 0, copyCount);
        }

        base = base2;
        check = check2;
        used = used2;
        nodes = nodes2;

        return allocSize = newSize;
    }

    /**
     * Collects the children of {@code parent} by scanning the keys in
     * {@code [parent.left, parent.right)} at character position
     * {@code parent.depth}.
     *
     * @param parent   node whose children are wanted
     * @param siblings receives one node per distinct character code, in key order
     * @return number of children found; 0 when an error is already recorded
     */
    private int fetch(Node parent, List<Node> siblings) {
        if (error_ < 0)
            return 0;

        int prev = 0;

        for (int i = parent.left; i < parent.right; i++) {
            String tmp = key.get(i);
            int keyLength = (length != null) ? length[i] : tmp.length();

            if (keyLength < parent.depth)
                continue;

            // Code 0 marks "key ends here"; real characters are shifted by one.
            int cur = 0;
            if (keyLength != parent.depth)
                cur = (int) tmp.charAt(parent.depth) + 1;

            if (prev > cur) {
                // Keys are not in the required order.
                error_ = -3;
                return 0;
            }

            if (cur != prev || siblings.size() == 0) {
                Node tmp_node = new Node();
                tmp_node.depth = parent.depth + 1;
                tmp_node.code = cur;
                tmp_node.left = i;

                // The previous sibling's key range ends where this one starts.
                if (siblings.size() != 0)
                    siblings.get(siblings.size() - 1).right = i;

                siblings.add(tmp_node);
            }

            prev = cur;
        }

        if (siblings.size() != 0)
            siblings.get(siblings.size() - 1).right = parent.right;

        return siblings.size();
    }

    /**
     * Finds a begin offset at which every sibling fits into a free unit,
     * claims those units and recursively inserts the children of each sibling.
     *
     * @param siblings non-empty list of nodes sharing one parent
     * @return the begin offset chosen for the group; 0 when an error is recorded
     */
    private int insert(List<Node> siblings) {
        if (error_ < 0)
            return 0;

        final int firstCode = siblings.get(0).code;
        final int lastCode = siblings.get(siblings.size() - 1).code;

        int begin = 0;
        int pos = Math.max(firstCode + 1, nextCheckPos) - 1;
        int nonzero_num = 0;
        int first = 0;

        if (allocSize <= pos)
            resize(pos + 1);

        outer: while (true) {
            pos++;

            if (allocSize <= pos)
                resize(pos + 1);

            if (check[pos] != 0) {
                nonzero_num++;
                continue;
            } else if (first == 0) {
                nextCheckPos = pos;
                first = 1;
            }

            begin = pos - firstCode;

            if (allocSize <= (begin + lastCode)) {
                // progress can be zero
                double l = Math.max(1.05, 1.0 * keySize / (progress + 1));
                // Grow by the heuristic factor, but never by less than what the
                // accesses below require.
                resize(Math.max((int) (allocSize * l), begin + lastCode + 1));
            }

            if (used[begin])
                continue;

            for (int i = 1; i < siblings.size(); i++)
                if (check[begin + siblings.get(i).code] != 0)
                    continue outer;

            break;
        }

        // -- Simple heuristics --
        // If the share of occupied units between nextCheckPos and pos is at
        // least 0.95, later searches start from pos instead.
        if (1.0 * nonzero_num / (pos - nextCheckPos + 1) >= 0.95)
            nextCheckPos = pos;

        used[begin] = true;
        size = Math.max(size, begin + lastCode + 1);

        for (int i = 0; i < siblings.size(); i++)
            check[begin + siblings.get(i).code] = begin;

        for (int i = 0; i < siblings.size(); i++) {
            Node sibling = siblings.get(i);
            List<Node> new_siblings = new ArrayList<Node>();

            if (fetch(sibling, new_siblings) == 0) {
                // No children: this unit stores the (negated) value of the key.
                int encoded = (value != null) ? (-value[sibling.left] - 1)
                    : (-sibling.left - 1);
                base[begin + sibling.code] = encoded;

                if (value != null && encoded >= 0) {
                    // Negative values cannot be represented.
                    error_ = -2;
                    return 0;
                }

                progress++;
            } else {
                int h = insert(new_siblings);
                // The arrays may have been re-allocated by the recursive call,
                // so they are only indexed after it returns.
                base[begin + sibling.code] = h;
                nodes[begin + sibling.code] = sibling;
            }
        }

        return begin;
    }

    /** Releases the unused tail of the unit arrays once a build has finished. */
    private void trimToSize() {
        if (base == null || size >= allocSize)
            return;

        int[] base2 = new int[size];
        int[] check2 = new int[size];
        Node[] nodes2 = new Node[size];

        System.arraycopy(base, 0, base2, 0, size);
        System.arraycopy(check, 0, check2, 0, size);
        System.arraycopy(nodes, 0, nodes2, 0, size);

        base = base2;
        check = check2;
        nodes = nodes2;
        allocSize = size;
    }

    /** Tells whether {@code index} addresses a unit that is currently in use. */
    private boolean isValidUnit(int index) {
        return base != null && check != null && index >= 0 && index < size;
    }

    /**
     * Returns the value of the key ending at the unit whose base value is
     * {@code b}, or -1 when no key ends there.
     */
    private int terminalValue(int b) {
        if (isValidUnit(b) && check[b] == b && base[b] < 0)
            return -base[b] - 1;
        return -1;
    }

    /** Interprets the optional flag of open/save; null means "include lengths". */
    private static boolean lengthsRequested(Boolean saveLen) {
        return saveLen == null || saveLen.booleanValue();
    }

    /** Creates an empty trie; call a build or open method before searching. */
    public DoubleArrayTrie() {
        check = null;
        base = null;
        used = null;
        nodes = null;
        size = 0;
        allocSize = 0;
        error_ = 0;
    }

    /** Drops the unit arrays, leaving a trie without any unit. */
    void clear() {
        check = null;
        base = null;
        used = null;
        nodes = null;
        allocSize = 0;
        size = 0;
    }

    /** @return number of bytes one unit takes in the serialized form */
    public int getUnitSize() {
        return UNIT_SIZE;
    }

    /** @return number of units in use */
    public int getSize() {
        return size;
    }

    /** @return number of bytes taken by all units in the serialized form */
    public int getTotalSize() {
        return size * UNIT_SIZE;
    }

    /** @return number of units whose check value is not zero */
    public int getNonzeroSize() {
        int result = 0;
        for (int i = 0; i < size; i++)
            if (check[i] != 0)
                result++;
        return result;
    }

    /**
     * Builds the trie from the given keys, using each key's index as its value.
     *
     * @param key keys in the order required by the builder
     * @return 0 on success (also when {@code key} is null and nothing is
     *         built), a negative error code otherwise
     */
    public int build(List<String> key) {
        if (key == null)
            return 0;

        int _keySize = key.size();
        int _length[] = new int[_keySize];
        for (int i = 0; i < _keySize; i++) {
            _length[i] = key.get(i).length();
        }

        return build(key, _length, null, _keySize);
    }

    /**
     * Builds the trie, replacing whatever it held before.
     *
     * @param _key     keys in the order required by the builder
     * @param _length  length of each key, or null to use {@code String.length()}
     * @param _value   value of each key, or null to use the key index
     * @param _keySize number of keys to insert
     * @return 0 on success, -2 when a negative value was supplied, -3 when the
     *         keys are not in the required order; also 0, without building
     *         anything, when {@code _key} is null or holds fewer than
     *         {@code _keySize} keys
     */
    public int build(List<String> _key, int _length[], int _value[], int _keySize) {
        // Null must be tested before the list is dereferenced.
        if (_key == null || _keySize > _key.size())
            return 0;

        // Start from a clean state so that an instance can be built repeatedly
        // and a failed build does not poison the next one.
        clear();
        error_ = 0;

        key = _key;
        length = _length;
        keySize = _keySize;
        value = _value;
        progress = 0;
        nextCheckPos = 0;

        resize(INITIAL_ALLOC_SIZE);
        base[0] = 1;

        Node root_node = new Node();
        root_node.left = 0;
        root_node.right = keySize;
        root_node.depth = 0;

        List<Node> siblings = new ArrayList<Node>();
        fetch(root_node, siblings);

        // Without keys there is nothing to place.
        if (!siblings.isEmpty())
            insert(siblings);

        trimToSize();

        used = null;
        key = null;

        return error_;
    }

    /**
     * Loads a trie, including key lengths, from a file written by
     * {@link #save(String)}.
     *
     * @param fileName file to read
     * @throws IOException if the file cannot be read or is malformed
     */
    public void open(String fileName) throws IOException {
        open(fileName, true);
    }

    /**
     * Loads a trie from a stream and closes the stream. The trie is left
     * untouched when reading fails.
     *
     * @param inputStream stream to read; closed by this method
     * @param saveLen     whether key lengths follow the units in the stream;
     *                    null is treated as true
     * @throws IOException if the stream cannot be read or holds a negative count
     */
    public void open(InputStream inputStream, Boolean saveLen) throws IOException {
        final boolean readLengths = lengthsRequested(saveLen);

        DataInputStream is = null;
        try {
            is = new DataInputStream(new BufferedInputStream(inputStream, BUF_SIZE));

            // Read into locals first; fields are only replaced once everything
            // has been read successfully.
            int newSize = is.readInt();
            if (newSize < 0)
                throw new IOException("Invalid unit count: " + newSize);

            int[] newBase = new int[newSize];
            int[] newCheck = new int[newSize];
            for (int i = 0; i < newSize; i++) {
                newBase[i] = is.readInt();
                newCheck[i] = is.readInt();
            }

            int newKeySize = 0;
            int[] newLength = null;
            if (readLengths) {
                newKeySize = is.readInt();
                if (newKeySize < 0)
                    throw new IOException("Invalid key count: " + newKeySize);

                newLength = new int[newKeySize];
                for (int i = 0; i < newKeySize; i++) {
                    newLength[i] = is.readInt();
                }
            }

            base = newBase;
            check = newCheck;
            size = newSize;
            allocSize = newSize;
            keySize = newKeySize;
            length = newLength;
            // Build-time data of a previous trie does not describe the loaded one.
            used = null;
            nodes = null;
        } finally {
            if (is != null)
                is.close();
        }
    }

    /**
     * Loads a trie from a file.
     *
     * @param fileName file to read
     * @param saveLen  whether key lengths follow the units in the file; null is
     *                 treated as true
     * @throws IOException if the file cannot be read or is malformed
     */
    public void open(String fileName, Boolean saveLen) throws IOException {
        File file = new File(fileName);
        open(new FileInputStream(file), saveLen);
    }

    /**
     * Writes the trie, including key lengths, to a file.
     *
     * @param fileName file to write
     * @throws IOException if the file cannot be written
     */
    public void save(String fileName) throws IOException {
        save(fileName, true);
    }

    /**
     * Writes the trie to a file: the unit count, then base/check of every
     * unit, then optionally the key count and every key length.
     *
     * @param fileName file to write
     * @param saveLen  whether key lengths are written as well; null is treated
     *                 as true
     * @throws IOException           if the file cannot be written
     * @throws IllegalStateException if key lengths are requested but the trie
     *                               has keys and no lengths
     */
    public void save(String fileName, Boolean saveLen) throws IOException {
        final boolean writeLengths = lengthsRequested(saveLen);

        // Fail before the target file is created or truncated.
        if (writeLengths && length == null && keySize > 0)
            throw new IllegalStateException("Key lengths are not available; save without lengths instead");

        DataOutputStream out = null;
        try {
            out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(fileName)));

            out.writeInt(size);
            for (int i = 0; i < size; i++) {
                out.writeInt(base[i]);
                out.writeInt(check[i]);
            }

            if (writeLengths) {
                out.writeInt(keySize);
                for (int i = 0; i < keySize; i++) {
                    out.writeInt(length[i]);
                }
            }
        } finally {
            // Closed exactly once; closing also flushes the buffered data.
            if (out != null)
                out.close();
        }
    }

    /**
     * Returns the length of a matched key.
     *
     * @param index index of the key
     * @return the length recorded for that key
     * @throws NullPointerException if no key lengths are available
     */
    public int getWordLen(int index) {
        return length[index];
    }

    /**
     * Looks up a whole key.
     *
     * @param key key to look up
     * @return the value of the key, or -1 when it is not in the trie
     */
    public int exactMatchSearch(String key) {
        return exactMatchSearch(key, 0, 0, 0);
    }

    /**
     * Looks up the characters {@code [pos, len)} of {@code key}, starting at
     * the given unit.
     *
     * @param key     text holding the key
     * @param pos     index of the first character to use
     * @param len     index one past the last character to use; values &lt;= 0
     *                mean {@code key.length()}
     * @param nodePos unit to start from; values &lt;= 0 mean the root
     * @return the value of the key, or -1 when it is not in the trie
     */
    public int exactMatchSearch(String key, int pos, int len, int nodePos) {
        if (len <= 0)
            len = key.length();
        if (nodePos <= 0)
            nodePos = 0;

        if (!isValidUnit(nodePos))
            return -1;

        int b = base[nodePos];

        for (int i = pos; i < len; i++) {
            int p = b + (int) key.charAt(i) + 1;
            // A target outside the used units can never be a valid transition.
            if (!isValidUnit(p) || b != check[p])
                return -1;
            b = base[p];
        }

        return terminalValue(b);
    }

    /**
     * Finds every key that is a prefix of the given text.
     *
     * @param key text to scan
     * @return values of all matching keys, shortest key first; never null
     */
    public List<Integer> commonPrefixSearch(String key) {
        return commonPrefixSearch(key, 0, 0, 0);
    }

    /**
     * Finds every key that is a prefix of the characters {@code [pos, len)} of
     * {@code key}, starting at the given unit.
     *
     * @param key     text to scan
     * @param pos     index of the first character to use
     * @param len     index one past the last character to use; values &lt;= 0
     *                mean {@code key.length()}
     * @param nodePos unit to start from; values &lt;= 0 mean the root
     * @return values of all matching keys, shortest key first; never null
     */
    public List<Integer> commonPrefixSearch(String key, int pos, int len, int nodePos) {
        if (len <= 0)
            len = key.length();
        if (nodePos <= 0)
            nodePos = 0;

        List<Integer> result = new ArrayList<Integer>();

        if (!isValidUnit(nodePos))
            return result;

        int b = base[nodePos];

        for (int i = pos; i < len; i++) {
            // A key may end here, before the next character is consumed.
            int found = terminalValue(b);
            if (found >= 0)
                result.add(found);

            int p = b + (int) key.charAt(i) + 1;
            if (!isValidUnit(p) || b != check[p])
                return result;
            b = base[p];
        }

        int found = terminalValue(b);
        if (found >= 0)
            result.add(found);

        return result;
    }

    /**
     * Partial match: follows the characters {@code [pos, len)} of {@code key}
     * through the trie without requiring that a key ends there. For example,
     * with a dictionary holding "Chengdu City", the text "Chengdu" matches.
     *
     * @param key     text to follow
     * @param pos     index of the first character to use
     * @param len     index one past the last character to use; values &lt;= 0
     *                mean {@code key.length()}
     * @param nodePos unit to start from; values &lt;= 0 mean the root
     * @return index of the unit reached after the last character, or -1 when a
     *         character cannot be followed or no character was consumed
     */
    public Integer partMatchSearch(String key, int pos, int len, int nodePos) {
        if (len <= 0)
            len = key.length();
        if (nodePos <= 0)
            nodePos = 0;

        if (!isValidUnit(nodePos))
            return -1;

        int b = base[nodePos];
        int p = -1;

        for (int i = pos; i < len; i++) {
            p = b + (int) key.charAt(i) + 1;
            if (!isValidUnit(p) || b != check[p])
                return -1;
            b = base[p];
        }

        return p;
    }

    /**
     * Returns the node description recorded for a unit during the build.
     *
     * @param index unit index, e.g. a result of {@link #partMatchSearch}
     * @return the node, or null when the index is null, negative or out of
     *         range, when the unit has no children, or when the trie was not
     *         built in this instance
     */
    public Node getNode(Integer index) {
        if (index == null || index < 0 || nodes == null || index >= nodes.length) {
            return null;
        }
        return nodes[index];
    }

    /**
     * Tells whether the unit at {@code nodePos} is only a partial match, which
     * callers use to decide how to match the next word.
     *
     * @param key     not used
     * @param nodePos unit index, e.g. a result of {@link #partMatchSearch}
     * @return false when a key ends at that unit, true otherwise (including
     *         when {@code nodePos} does not address a unit in use)
     */
    public boolean isPartMatched(String key, int nodePos) {
        if (!isValidUnit(nodePos))
            return true;
        return terminalValue(base[nodePos]) < 0;
    }

    /**
     * Two tries are equal when they have the same number of units and keys,
     * the same base/check content and the same key lengths.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof DoubleArrayTrie)) {
            return false;
        }

        DoubleArrayTrie datObj = (DoubleArrayTrie) obj;

        if (size != datObj.size || keySize != datObj.keySize) {
            return false;
        }

        for (int i = 0; i < size; i++) {
            if (base[i] != datObj.base[i] || check[i] != datObj.check[i]) {
                return false;
            }
        }

        if (keySize > 0 && (length == null || datObj.length == null)) {
            // Without lengths on one side, only "no lengths on both" is equal.
            return length == datObj.length;
        }

        for (int i = 0; i < keySize; i++) {
            if (length[i] != datObj.length[i]) {
                return false;
            }
        }

        return true;
    }

    /**
     * Hash over exactly the data compared by {@link #equals(Object)}, so that
     * equal tries have equal hash codes. The value changes when the trie is
     * rebuilt or reloaded, and computing it visits every unit.
     */
    @Override
    public int hashCode() {
        int result = 31 * size + keySize;

        for (int i = 0; i < size; i++) {
            result = 31 * result + base[i];
            result = 31 * result + check[i];
        }

        if (length != null) {
            for (int i = 0; i < keySize; i++) {
                result = 31 * result + length[i];
            }
        }

        return result;
    }

}

DoubleArrayTrie的更多相关文章

双数组Trie树(DoubleArrayTrie)Java实现
http://www.hankcs.com/program/java/%E5%8F%8C%E6%95%B0%E7%BB%84trie%E6%A0%91doublearraytriejava%E5%AE ...
An Implementation of Double-Array Trie
Contents What is Trie? What Does It Take to Implement a Trie? Tripple-Array Trie Double-Array Trie S ...
中文分词系列（二）基于双数组Trie树的AC自动机
秉着能偷懒就偷懒的精神,关于AC自动机本来不想看的,但是HanLp的源码中用户自定义词典的识别是用的AC自动机实现的.唉-没办法,还是看看吧 AC自动机理论 Aho Corasick自动机,简称AC自 ...
从Trie树到双数组Trie树
Trie树原理又称单词查找树,Trie树,是一种树形结构,是一种哈希树的变种.它的优点是:利用字符串的公共前缀来减少查询时间,最大限度地减少无谓的字符串比较,能在常数时间O(len)内实现插入和查 ...
Trie树（字典树）推荐文章
Trie树也被称为字典树,通过这个名字,可以明显知道这种树的结构:像字典一样进行查找的树(想想采用拼音法查找汉字的时候的过程,实质上就是一个逐字母匹配的过程).Trie树就是利用了这种思想构造出来的多 ...
HanLP用户自定义词典源码分析
HanLP用户自定义词典源码分析 1. 官方文档及参考链接关于词典问题Issue,首先参考:FAQ 自定义词典其实是基于规则的分词,它的用法参考这个issue 如果有些数量词.字母词需要分词,可参考 ...
Hanlp自然语言处理中的词典格式说明
使用过hanlp的都知道hanlp中有许多词典,它们的格式都是非常相似的,形式都是文本文档,随时可以修改.本篇文章详细介绍了hanlp中的词典格式,以满足用户自定义的需要. 基本格式词典分为词频词性 ...
python利用Trie(前缀树)实现搜索引擎中关键字输入提示（学习Hash Trie和Double-array Trie）
python利用Trie(前缀树)实现搜索引擎中关键字输入提示(学习Hash Trie和Double-array Trie) 主要包括两部分内容:(1)利用python中的dict实现Trie:(2) ...
利用trie树实现前缀输入提示及trie的python实现
代码来自https://github.com/wklken/suggestion/blob/master/easymap/suggest.py 还实现了缓存功能,搜索某个前缀超过一定次数时,进行缓存, ...

随机推荐

记：使用IScroll.js 开发picker日历组件遇到的问题及经验总结
IScroll中文文档第一个问题: 边界留白就是这种,上边界(最小),下边界(最大)有两个列表的位置是不能选择的.解决的办法是: 在HTML中,添加空白节点就行了. 第二个问题:初始化之后的滚动停 ...
router单页面多个标签tags的用法<router-view></router-view>
<keep-alive><router-view :key="path" /></keep-alive>
ASP.NET Core应用程序的参数配置及使用（转载）
本文结构提前准备参数配置方式 appsettings.json 环境变量命令行参数在控制器中使用配置参数注入IConfiguration对象注入IOptions对象总结应用程序的开发不 ...
Java判断两个时间相差的天数
1.实现目标输入:两个日期输出:两个日期相差的天数 2.代码实现方法1: 通过Calendar类的日期比较.注意:这里需要考虑一下: 日期是跨年份的,如一个是2012年,一个是2015年的 ...
第三章 Maven构建 Java Spring Boot Web项目
3.1 认识Srping Boot Spring Boot是一个框架,是一种全新的编程规范,它的产生简化了对框架的使用,简化了Spring众多的框架中大量的繁琐的配置文件,所以说Spring Bo ...
2019 竞网智赢java面试笔试题（含面试题解析）
本人5年开发经验.18年年底开始跑路找工作,在互联网寒冬下成功拿到阿里巴巴.今日头条.竞网智赢等公司offer,岗位是Java后端开发,因为发展原因最终选择去了竞网智赢,入职一年时间了,也成为了面 ...
kylin安装过程问题排查
问题:日志报错:/usr/local/apps/kylin/tomcat/conf/.keystore (没有那个文件或目录) 解决:在kylin内置tomcat的server.xml中里边有个对ht ...
Android-----实现给图片添加字体
实现给图片添加字体,图片旋转功能:xml布局文件内容如下,一个简单的ImageView布局 <com.example.hsjgapp.RotateImageView //这里存放要展示的图片 a ...
FFMPEG 命令行工具- ffplay
ffplay 简介 ffplay是ffmpeg工程中提供的播放器,功能相当的强大,凡是ffmpeg支持的视音频格式它基本上都支持.甚至连VLC不支持的一些流媒体都可以播放,但是它的缺点是其不是图形化界 ...
HTTP/2和Python的支持现状-2019-10
背景: 大概2019年9月份,天猫全面升级了HTTP/2的支持,并且加强了HTTP/1的访问限制,也可能很早前就这么做了, 但之前一直没限制HTTP/1的访问.之所以发现这个问题,是因为写的爬虫突然失 ...

DoubleArrayTrie

DoubleArrayTrie的更多相关文章

随机推荐

热门专题