大规模字符串检索-压缩trie树
本文使用压缩trie树实现字符串检索的功能。首先将字符串通过编码转化为二进制串,随后将二进制串插入到trie树中,在插入过程中同时实现压缩的功能。
字符编码采用Huffman编码。但最终测试发现,不使用Huffman编码不仅省去了编码时间,trie树的插入时间也有所减少。
/**
程序主函数与编码
*/
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "huffman.h"
#include "compress_trie.h"
//#include <time.h>

#define NUM_OF_HUFFMAN 81
#define LENGTH_OF_LINE 10000
#define RESULT_OF_HUFFMAN "result_of_HUFFMAN.dat"
//#define EMAIL "strpool.dat"
//#define CHECKED_EMAIL "checkedemail.dat"
#define RESULT "result.dat" void str_to_bin(char buf[],char binary[],huffman_node hufm[]); int main(int argc, char *argv[])
{
//time_t time_start,time_end;
//time_start = time(NULL); char* EMAIL = argv[];
char* CHECKED_EMAIL = argv[]; huffman_node hufm[NUM_OF_HUFFMAN];
hufm_init(hufm,NUM_OF_HUFFMAN);
char buf[LENGTH_OF_LINE];
char binary[LENGTH_OF_LINE]; FILE* fin_of_huffman;
fin_of_huffman = fopen(RESULT_OF_HUFFMAN,"r");
if(fin_of_huffman == NULL)
{
hufm_init(hufm,NUM_OF_HUFFMAN);
int i;
for(i=;i<(NUM_OF_HUFFMAN+)/;i++)
{
hufm[i].num_of_ch = NUM_OF_HUFFMAN - i;
}
huffman_coding(hufm,NUM_OF_HUFFMAN);
}
else
{
char temp_char;
int i;
for(i=;i<(NUM_OF_HUFFMAN+)/;i++)
{
fgets(buf,sizeof(buf),fin_of_huffman);
sscanf(buf,"%c %d %s",&temp_char,&hufm[i].num_of_ch,hufm[i].code);
}
}
fclose(fin_of_huffman); printf("building trie...");
FILE* fin_of_email;
fin_of_email = fopen(EMAIL,"r");
trie_node *root;
root = (trie_node*)malloc(sizeof(trie_node));
trie_node_init(&root); while(fgets(buf,sizeof(buf),fin_of_email)!=NULL)
{
str_to_bin(buf,binary,hufm);
trie_insert(&root,binary);
}
fclose(fin_of_email);
printf("\r");
printf("build trie success.\n"); FILE *fin_of_checked,*fout_of_result;
fin_of_checked = fopen(CHECKED_EMAIL,"r");
fout_of_result = fopen(RESULT,"w");
int num_yes = ;
int num_no = ;
while(fgets(buf,sizeof(buf),fin_of_checked)!=NULL)
{
str_to_bin(buf,binary,hufm);
if(trie_search(root,binary))
{
fprintf(fout_of_result,"YES\n");
num_yes++;
}
else
{
fprintf(fout_of_result,"NO\n");
num_no++;
}
}
fprintf(fout_of_result,"num of YES is:%d\n",num_yes);
fprintf(fout_of_result,"num of NO is:%d\n",num_no);
printf("search success!\n");
fclose(fin_of_checked);
fclose(fout_of_result);
//time_end = time(NULL);
//printf("用时:%.0lfs\n", difftime(time_end, time_start));
return ;
} void str_to_bin(char buf[],char binary[],huffman_node hufm[])
{
int i;
binary[] = '\0';
for(i=strlen(buf)-;i>=;i--)
{
if(buf[i]>='a' && buf[i]<='z')
{
strcat(binary,hufm[buf[i]-'a'].code);
}
else if(buf[i]>='A' && buf[i]<='Z')
{
strcat(binary,hufm[buf[i]-'A'].code);
}
else if(buf[i]>='' && buf[i]<='')
{
strcat(binary,hufm[+buf[i]-''].code);
}
else if(buf[i]=='_')
{
strcat(binary,hufm[].code);
}
else if(buf[i]=='-')
{
strcat(binary,hufm[].code);
}
else if(buf[i]=='.')
{
strcat(binary,hufm[].code);
}
else if(buf[i]=='@')
{
strcat(binary,hufm[].code);
}
else
{
strcat(binary,hufm[].code);
}
}
}
/**
 * Compressed binary trie: insertion and lookup.
 *
 * Keys are NUL-terminated strings of '0' and '1' characters. Each node holds
 * a run of compressed bits that must match the key before the node's
 * children are considered. The key bit that follows the run selects the
 * child (point_of_zero or point_of_one) and is consumed by that edge.
 */

typedef struct TRIE_NODE
{
    char is_str;                    /* non-zero when a stored key ends at this node */
    unsigned short num_of_bit;      /* number of compressed bits held by this node */
    unsigned char *compress_of_bit; /* bit storage, see TRIE_INLINE_BITS */
    struct TRIE_NODE *point_of_zero, *point_of_one;
} trie_node;

/*
 * A node with at most TRIE_INLINE_BITS bits keeps them in the VALUE of
 * compress_of_bit (no allocation). Longer runs live in a heap array of
 * (num_of_bit + 7) / 8 bytes. In both forms the first character of the run
 * is bit index num_of_bit - 1 and the last is bit 0, so dropping a prefix
 * only shortens the run and never moves the remaining bits.
 */
enum { TRIE_INLINE_BITS = 32 };

/* Longest key that still fits the unsigned short bit counter. */
#define TRIE_MAX_BITS ((size_t)(unsigned short)-1)

/* Compile-time check: a pointer must be able to carry TRIE_INLINE_BITS bits. */
typedef char trie_inline_bits_fit_in_pointer[
    (sizeof(uintptr_t) * 8 >= TRIE_INLINE_BITS) ? 1 : -1];

void trie_node_init(trie_node **root);
int trie_insert(trie_node **root, char *bit_of_insert);
int trie_search(trie_node *root, char *bit_of_insert);
void trie_delete(trie_node *root);
void compress(trie_node *root, char *bit_of_insert);
int compare_of_bit(trie_node *root, char *bit_of_insert);
void pop_bit(trie_node *root, char *bit_of_pop, int len_of_pop);

/* Allocation policy for the trie: report and exit when memory runs out. */
static void *trie_xmalloc(size_t size)
{
    void *mem = malloc(size);

    if (mem == NULL) {
        fprintf(stderr, "compress_trie: out of memory\n");
        exit(EXIT_FAILURE);
    }
    return mem;
}

/* Returns a freshly allocated, initialized node. */
static trie_node *trie_node_new(void)
{
    trie_node *node = trie_xmalloc(sizeof *node);

    trie_node_init(&node);
    return node;
}

/* Returns bit `index` (0 = last character of the run) of node as 0 or 1. */
static int trie_bit_get(const trie_node *node, int index)
{
    if (node->num_of_bit <= TRIE_INLINE_BITS) {
        uint32_t packed = (uint32_t)(uintptr_t)node->compress_of_bit;

        return (int)((packed >> index) & 1u);
    }
    return (node->compress_of_bit[index / 8] >> (index % 8)) & 1;
}

/*
 * Returns the address of the child pointer selected by `bit` ('0' selects
 * point_of_zero, anything else point_of_one), creating the child if absent.
 */
static trie_node **trie_child_slot(trie_node *node, char bit)
{
    trie_node **slot = (bit == '0') ? &node->point_of_zero
                                    : &node->point_of_one;

    if (*slot == NULL) {
        *slot = trie_node_new();
    }
    return slot;
}

/* Resets *root to an empty node: no key ends here, no bits, no children. */
void trie_node_init(trie_node **root)
{
    (*root)->is_str = (char)0;
    (*root)->num_of_bit = 0;
    (*root)->compress_of_bit = NULL;
    (*root)->point_of_zero = NULL;
    (*root)->point_of_one = NULL;
}

/*
 * Stores the whole '0'/'1' string bit_of_insert as the compressed run of
 * root. Any character other than '0' is stored as a 1 bit.
 *
 * The previous storage of root is overwritten without being freed, so root
 * must not own a heap run when this is called. The string must not be
 * longer than TRIE_MAX_BITS.
 */
void compress(trie_node *root, char *bit_of_insert)
{
    size_t len_of_insert = strlen(bit_of_insert);
    size_t i;

    root->num_of_bit = (unsigned short)len_of_insert;

    if (len_of_insert <= TRIE_INLINE_BITS) {
        uint32_t packed = 0;

        /* Shifting in from the right leaves the first character highest. */
        for (i = 0; i < len_of_insert; i++) {
            packed = (packed << 1) | (uint32_t)(bit_of_insert[i] != '0');
        }
        root->compress_of_bit = (unsigned char *)(uintptr_t)packed;
    } else {
        size_t num_of_byte = (len_of_insert + 7) / 8;
        unsigned char *bytes = trie_xmalloc(num_of_byte);

        memset(bytes, 0, num_of_byte);
        for (i = 0; i < len_of_insert; i++) {
            size_t bit = len_of_insert - 1 - i;

            if (bit_of_insert[i] != '0') {
                bytes[bit / 8] |= (unsigned char)(1u << (bit % 8));
            }
        }
        root->compress_of_bit = bytes;
    }
}

/*
 * Adds the '0'/'1' string bit_of_insert to the trie rooted at *root.
 * *root may be replaced when the root node has to be split.
 *
 * Returns 1 when the key is stored (also when it was already present) and
 * 0 when root or *root is NULL or the key is longer than TRIE_MAX_BITS.
 * NOTE(review): the original return values were lost from the published
 * listing; 1/0 is a reconstruction, and the visible caller ignores the
 * result.
 */
int trie_insert(trie_node **root, char *bit_of_insert)
{
    trie_node **slot = root;
    char *bits = bit_of_insert;

    if (root == NULL || *root == NULL) {
        return 0;
    }
    if (strlen(bit_of_insert) > TRIE_MAX_BITS) {
        return 0;
    }

    for (;;) {
        trie_node *node = *slot;
        trie_node *father;
        char edge[2];
        int common;

        if (node->num_of_bit == 0) {
            if (*bits == '\0') {
                node->is_str = (char)1;
                return 1;
            }
            if (node->is_str == 0
                && node->point_of_zero == NULL
                && node->point_of_one == NULL) {
                /* Unused leaf: keep the whole remainder as one run. */
                compress(node, bits);
                node->is_str = (char)1;
                return 1;
            }
            slot = trie_child_slot(node, *bits);
            bits++;
            continue;
        }

        common = compare_of_bit(node, bits);
        if (common == (int)node->num_of_bit) {
            /* The node's run is a prefix of the key. */
            bits += common;
            if (*bits == '\0') {
                node->is_str = (char)1;
                return 1;
            }
            slot = trie_child_slot(node, *bits);
            bits++;
            continue;
        }

        /*
         * The key leaves the node's run after `common` bits. Split the node:
         * a new father keeps the shared prefix, the next bit of the old run
         * becomes the edge to the old node, and the old node keeps the rest.
         */
        father = trie_node_new();
        if (common > 0) {
            char *prefix = trie_xmalloc((size_t)common + 1);

            pop_bit(node, prefix, common);
            compress(father, prefix);
            free(prefix);
        }
        pop_bit(node, edge, 1);
        if (edge[0] == '0') {
            father->point_of_zero = node;
        } else {
            father->point_of_one = node;
        }
        *slot = father;

        bits += common;
        if (*bits == '\0') {
            /* The key is exactly the shared prefix. */
            father->is_str = (char)1;
            return 1;
        }
        /* The key's next bit differs from the edge bit, so this slot is new. */
        slot = trie_child_slot(father, *bits);
        bits++;
    }
}

/*
 * Looks up the '0'/'1' string bit_of_search in the trie rooted at root.
 *
 * Returns the is_str flag of the node where the key ends, or 0 when the key
 * leaves the trie, stops inside a node's compressed run, or contains a
 * character other than '0' and '1' at a branch.
 */
int trie_search(trie_node *root, char *bit_of_search)
{
    trie_node *node = root;
    char *bits = bit_of_search;

    while (node != NULL) {
        if (node->num_of_bit != 0) {
            if (compare_of_bit(node, bits) != (int)node->num_of_bit) {
                return 0;
            }
            bits += node->num_of_bit;
        }
        if (*bits == '\0') {
            return node->is_str;
        }
        if (*bits == '0') {
            node = node->point_of_zero;
        } else if (*bits == '1') {
            node = node->point_of_one;
        } else {
            return 0;
        }
        bits++;
    }
    return 0;
}

/* Frees root, all of its descendants and their heap bit storage. */
void trie_delete(trie_node *root)
{
    if (root == NULL) {
        return;
    }
    trie_delete(root->point_of_zero);
    trie_delete(root->point_of_one);
    if (root->num_of_bit > TRIE_INLINE_BITS) {
        free(root->compress_of_bit); /* only long runs are heap-allocated */
    }
    free(root);
}

/*
 * Returns how many leading characters of bit_of_insert equal the leading
 * bits of root's compressed run. The result is at most the shorter of the
 * two lengths.
 */
int compare_of_bit(trie_node *root, char *bit_of_insert)
{
    int matched = 0;
    int bit = (int)root->num_of_bit - 1;

    while (bit >= 0 && bit_of_insert[matched] != '\0') {
        if (bit_of_insert[matched] - '0' != trie_bit_get(root, bit)) {
            break;
        }
        matched++;
        bit--;
    }
    return matched;
}

/*
 * Removes the first len_of_pop bits from root's compressed run and writes
 * them to bit_of_pop as a NUL-terminated '0'/'1' string. bit_of_pop needs
 * room for len_of_pop + 1 bytes. len_of_pop is clamped to the range
 * 0..num_of_bit.
 *
 * The storage is shrunk to fit what remains: a heap run that becomes short
 * enough is converted to the inline form, and an empty run leaves
 * compress_of_bit NULL.
 */
void pop_bit(trie_node *root, char *bit_of_pop, int len_of_pop)
{
    int old_bits = (int)root->num_of_bit;
    int remaining;
    int i;
    int bit;

    if (len_of_pop < 0) {
        len_of_pop = 0;
    }
    if (len_of_pop > old_bits) {
        len_of_pop = old_bits;
    }
    remaining = old_bits - len_of_pop;

    for (i = 0, bit = old_bits - 1; i < len_of_pop; i++, bit--) {
        bit_of_pop[i] = (char)('0' + trie_bit_get(root, bit));
    }
    bit_of_pop[i] = '\0';

    if (old_bits > TRIE_INLINE_BITS) {
        if (remaining == 0) {
            free(root->compress_of_bit);
            root->compress_of_bit = NULL;
        } else if (remaining <= TRIE_INLINE_BITS) {
            uint32_t packed = 0;

            /* Read the heap run before releasing it; bit k stays bit k. */
            for (bit = remaining - 1; bit >= 0; bit--) {
                packed = (packed << 1) | (uint32_t)trie_bit_get(root, bit);
            }
            free(root->compress_of_bit);
            root->compress_of_bit = (unsigned char *)(uintptr_t)packed;
        } else {
            size_t old_bytes = ((size_t)old_bits + 7) / 8;
            size_t new_bytes = ((size_t)remaining + 7) / 8;

            if (new_bytes != old_bytes) {
                unsigned char *shrunk = realloc(root->compress_of_bit,
                                                new_bytes);

                if (shrunk != NULL) {
                    root->compress_of_bit = shrunk;
                }
                /* On failure the larger buffer is still valid; keep it. */
            }
        }
    } else if (remaining == 0) {
        root->compress_of_bit = NULL;
    }
    root->num_of_bit = (unsigned short)remaining;
}
大规模字符串检索-压缩trie树的更多相关文章
- 835. 字符串统计(Trie树模板题)
维护一个字符串集合,支持两种操作: “I x”向集合中插入一个字符串x; “Q x”询问一个字符串在集合中出现了多少次. 共有N个操作,输入的字符串总长度不超过 $10^5$,字符串仅包含小写英文字母 ...
- Trie树|字典树(字符串排序)
有时,我们会碰到对字符串的排序,若采用一些经典的排序算法,则时间复杂度一般为O(n*lgn),但若采用Trie树,则时间复杂度仅为O(n). Trie树又名字典树,从字面意思即可理解,这种树的结构像英 ...
- Trie树及其应用
Trie树及其应用 Trie树 Trie树,又称单词查找树.字典树,是一种树形结构,是一种哈希树的变种,是一种用于快速检索的多叉树结构.典型应用是用于统计和排序大量的字符串(但不仅限于字符串),所以经 ...
- Trie树(代码),后缀树(代码)
Trie树系列 Trie字典树 压缩的Trie 后缀树Suffix tree 后缀树--ukkonen算法 Trie是通过对字符串进行预先处理,达到加快搜索速度的算法.即把文本中的字符串转换为树结构, ...
- [转]双数组TRIE树原理
原文名称: An Efficient Digital Search Algorithm by Using a Double-Array Structure 作者: JUN-ICHI AOE 译文: 使 ...
- 【动画】看动画轻松理解「Trie树」
Trie树 Trie这个名字取自“retrieval”,检索,因为Trie可以只用一个前缀便可以在一部字典中找到想要的单词. 虽然发音与「Tree」一致,但为了将这种 字典树 与 普通二叉树 以示区别 ...
- Trie树(Prefix Tree)介绍
本文用尽量简洁的语言介绍一种树形数据结构 -- Trie树. 一.什么是Trie树 Trie树,又叫字典树.前缀树(Prefix Tree).单词查找树 或 键树,是一种多叉树结构.如下图: 上图是一 ...
- 数据结构与算法—Trie树
Trie,又经常叫前缀树,字典树等等.它有很多变种,如后缀树,Radix Tree/Trie,PATRICIA tree,以及bitwise版本的crit-bit tree.当然很多名字的意义其实有交 ...
- trie树(前缀树)详解——PHP代码实现
trie树常用于搜索提示.如当输入一个网址,可以自动搜索出可能的选择.当没有完全匹配的搜索结果,可以返回前缀最相似的可能. 一.Trie树的基本性质 根节点不包含字符,除根节点外每一个节点都只包含一个 ...
随机推荐
- 【转】Java ConcurrentModificationException 异常分析与解决方案--还不错
原文网址:http://www.2cto.com/kf/201403/286536.html 一.单线程 1. 异常情况举例 只要抛出出现异常,可以肯定的是代码一定有错误的地方.先来看看都有哪些情况会 ...
- 开发备必:WEB前端开发规范文档
为提高团队协作效率, 便于后台人员添加功能及前端后期优化维护, 输出高质量的文档, 特制订此文档. 本规范文档一经确认, 前端开发人员必 须按本文档规范进行前台页面开发. 本文档如有不对或者不合适的地 ...
- suse linux编译安装GCC报错
gcc编译安装过程 1.先安装三个库 gmp mprc mpc 这三个库的源码要到官网去下载 1)安装gmp:首先建立源码同级目录 gmp-build,输入命令,第一次编译不通过,发现缺少一个叫m4的 ...
- [转载] cookie、JS记录及跳转到页面原来的位置
额....如下 <!-- 定位页面的 Cookie function SetCookie(sName, sValue) { date = new Date(); s = date.getDate ...
- FBReader移植日记 第一天
1.目标是创建两个工程,一个j2se的桌面软件,用于编辑和预览epub等格式的电子书,预览的窗口可以设置分辨率来模拟不同的设备,把编辑的结果实时的显示出来.另一个是Android的应用,用于阅读,管理 ...
- JavaScript高级程序设计7.pdf
function类型 每个函数都是function类型的实例,函数是对象,函数名是指向对象的指针 function sum(num1,num2) { return num1+num2; } //等价于 ...
- About using UTF-8 fields in MySQL
https://www.adayinthelifeof.nl/2010/12/04/about-using-utf-8-fields-in-mysql/ I sometimes hear: “make ...
- 002-python书写规范--消去提示波浪线
强迫症患者面对PyCharm的波浪线是很难受的,针对如下代码去除PyCharm中的波浪线: # _*_coding:utf-8_*_ # /usr/bin/env python3 A_user = & ...
- 【转载】nginx 并发数问题思考:worker_connections,worker_processes与 max clients
注:这个文章主要是作者一直在研究nginx作为http server和反向代理服务器时候所谓最大的max_clients和 worker_connections的计算公式, 其实最后的结论也没有卡上公 ...
- BZOJ 1835: [ZJOI2010]base 基站选址 [序列DP 线段树]
1835: [ZJOI2010]base 基站选址 题目描述 有N个村庄坐落在一条直线上,第i(i>1)个村庄距离第1个村庄的距离为Di.需要在这些村庄中建立不超过K个通讯基站,在第i个村庄建立 ...