大规模字符串检索-压缩trie树
本文使用压缩trie树实现字符串检索的功能。首先将字符串通过编码转化为二进制串,随后将二进制串插入到trie树中,在插入过程中同时实现压缩的功能。
字符编码采用Huffman,但最终测试发现不采用Huffman的方法不仅省下了编码时间,同时trie树的插入时间也有所减少。
/**
程序主函数与编码
*/
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "huffman.h"
#include "compress_trie.h"
//#include <time.h> #define NUM_OF_HUFFMAN 81
#define LENGTH_OF_LINE 10000
#define RESULT_OF_HUFFMAN "result_of_HUFFMAN.dat"
//#define EMAIL "strpool.dat"
//#define CHECKED_EMAIL "checkedemail.dat"
#define RESULT "result.dat" void str_to_bin(char buf[],char binary[],huffman_node hufm[]); int main(int argc, char *argv[])
{
//time_t time_start,time_end;
//time_start = time(NULL); char* EMAIL = argv[];
char* CHECKED_EMAIL = argv[]; huffman_node hufm[NUM_OF_HUFFMAN];
hufm_init(hufm,NUM_OF_HUFFMAN);
char buf[LENGTH_OF_LINE];
char binary[LENGTH_OF_LINE]; FILE* fin_of_huffman;
fin_of_huffman = fopen(RESULT_OF_HUFFMAN,"r");
if(fin_of_huffman == NULL)
{
hufm_init(hufm,NUM_OF_HUFFMAN);
int i;
for(i=;i<(NUM_OF_HUFFMAN+)/;i++)
{
hufm[i].num_of_ch = NUM_OF_HUFFMAN - i;
}
huffman_coding(hufm,NUM_OF_HUFFMAN);
}
else
{
char temp_char;
int i;
for(i=;i<(NUM_OF_HUFFMAN+)/;i++)
{
fgets(buf,sizeof(buf),fin_of_huffman);
sscanf(buf,"%c %d %s",&temp_char,&hufm[i].num_of_ch,hufm[i].code);
}
}
fclose(fin_of_huffman); printf("building trie...");
FILE* fin_of_email;
fin_of_email = fopen(EMAIL,"r");
trie_node *root;
root = (trie_node*)malloc(sizeof(trie_node));
trie_node_init(&root); while(fgets(buf,sizeof(buf),fin_of_email)!=NULL)
{
str_to_bin(buf,binary,hufm);
trie_insert(&root,binary);
}
fclose(fin_of_email);
printf("\r");
printf("build trie success.\n"); FILE *fin_of_checked,*fout_of_result;
fin_of_checked = fopen(CHECKED_EMAIL,"r");
fout_of_result = fopen(RESULT,"w");
int num_yes = ;
int num_no = ;
while(fgets(buf,sizeof(buf),fin_of_checked)!=NULL)
{
str_to_bin(buf,binary,hufm);
if(trie_search(root,binary))
{
fprintf(fout_of_result,"YES\n");
num_yes++;
}
else
{
fprintf(fout_of_result,"NO\n");
num_no++;
}
}
fprintf(fout_of_result,"num of YES is:%d\n",num_yes);
fprintf(fout_of_result,"num of NO is:%d\n",num_no);
printf("search success!\n");
fclose(fin_of_checked);
fclose(fout_of_result);
//time_end = time(NULL);
//printf("用时:%.0lfs\n", difftime(time_end, time_start));
return ;
} void str_to_bin(char buf[],char binary[],huffman_node hufm[])
{
int i;
binary[] = '\0';
for(i=strlen(buf)-;i>=;i--)
{
if(buf[i]>='a' && buf[i]<='z')
{
strcat(binary,hufm[buf[i]-'a'].code);
}
else if(buf[i]>='A' && buf[i]<='Z')
{
strcat(binary,hufm[buf[i]-'A'].code);
}
else if(buf[i]>='' && buf[i]<='')
{
strcat(binary,hufm[+buf[i]-''].code);
}
else if(buf[i]=='_')
{
strcat(binary,hufm[].code);
}
else if(buf[i]=='-')
{
strcat(binary,hufm[].code);
}
else if(buf[i]=='.')
{
strcat(binary,hufm[].code);
}
else if(buf[i]=='@')
{
strcat(binary,hufm[].code);
}
else
{
strcat(binary,hufm[].code);
}
}
}
/**
 * Compressed binary trie: insertion and lookup of '0'/'1' strings.
 *
 * Every node stores a run of `num_of_bit` bits (possibly empty) followed by
 * up to two children selected by the next bit of the key. Runs of at most
 * TRIE_INLINE_BITS bits are packed directly into the `compress_of_bit`
 * pointer value; longer runs live in a heap buffer it points to. Within the
 * packed form, bit index 0 is the LAST character of the run.
 *
 * NOTE(review): the published listing lost its numeric literals during
 * extraction. 32 (inline threshold, inferred from the `int` temporary the
 * original used) and 8 (bits per byte) are reconstructed values.
 */
enum {
    TRIE_INLINE_BITS   = 32,
    TRIE_BITS_PER_BYTE = 8
};

typedef struct TRIE_NODE
{
    char is_str;                    /* 1 when a stored key ends at this node */
    unsigned short num_of_bit;      /* length of the bit run held by the node */
    unsigned char* compress_of_bit; /* packed bits (inline value or heap buffer) */
    struct TRIE_NODE *point_of_zero,*point_of_one;
}trie_node;

void trie_node_init(trie_node **root);
int trie_insert(trie_node **root,char* bit_of_insert);
int trie_search(trie_node *root,char* bit_of_insert);
void trie_delete(trie_node *root);
void compress(trie_node *root,char* bit_of_insert);
int compare_of_bit(trie_node *root,char* bit_of_insert);
void pop_bit(trie_node *root,char* bit_of_pop,int len_of_pop);

/* Allocates `size` bytes or terminates the program; never returns NULL. */
static void *trie_alloc(size_t size)
{
    void *mem = malloc(size != 0 ? size : 1);
    if (mem == NULL) {
        fprintf(stderr, "compress_trie: out of memory\n");
        exit(EXIT_FAILURE);
    }
    return mem;
}

/* Allocates a node and puts it into the empty state. */
static trie_node *trie_node_new(void)
{
    trie_node *node = trie_alloc(sizeof *node);
    trie_node_init(&node);
    return node;
}

/* Number of bytes needed to hold `nbits` packed bits. */
static size_t trie_bytes_for(int nbits)
{
    return ((size_t)nbits + TRIE_BITS_PER_BYTE - 1) / TRIE_BITS_PER_BYTE;
}

/* Reads packed bit `idx` (0 = last character of the run) of a node. */
static int trie_stored_bit(const trie_node *node, int idx)
{
    if (node->num_of_bit <= TRIE_INLINE_BITS) {
        return (int)(((uintptr_t)node->compress_of_bit >> idx) & 1u);
    }
    return (node->compress_of_bit[idx / TRIE_BITS_PER_BYTE]
            >> (idx % TRIE_BITS_PER_BYTE)) & 1;
}

/* Returns the child link selected by `bit`, creating the child if absent. */
static trie_node **trie_child_slot(trie_node *node, char bit)
{
    trie_node **slot = (bit == '0') ? &node->point_of_zero
                                    : &node->point_of_one;
    if (*slot == NULL) {
        *slot = trie_node_new();
    }
    return slot;
}

/* Resets an already allocated node: no key, no bits, no children. */
void trie_node_init(trie_node **root)
{
    (*root)->is_str = (char)0;
    (*root)->num_of_bit = 0;
    (*root)->compress_of_bit = NULL;
    (*root)->point_of_zero = NULL;
    (*root)->point_of_one = NULL;
}

/*
 * Stores the whole '0'/'1' string `bit_of_insert` as the bit run of `root`.
 * Any character other than '0' is stored as a 1 bit, as in the original.
 * The node must not own a heap buffer when this is called (it is not freed).
 * The run length is kept in an unsigned short, so strings longer than that
 * type can represent are not supported.
 */
void compress(trie_node *root,char* bit_of_insert)
{
    int len_of_insert = (int)strlen(bit_of_insert);
    int i, j;

    root->num_of_bit = (unsigned short)len_of_insert;
    if (len_of_insert <= TRIE_INLINE_BITS) {
        /* Start from 0: the original read an uninitialised temporary. */
        uintptr_t packed = 0;
        for (i = len_of_insert - 1, j = 0; i >= 0; i--, j++) {
            if (bit_of_insert[i] != '0') {
                packed |= (uintptr_t)1 << j;
            }
        }
        root->compress_of_bit = (unsigned char *)packed;
    } else {
        size_t nbytes = trie_bytes_for(len_of_insert);
        unsigned char *bytes = trie_alloc(nbytes);
        memset(bytes, 0, nbytes);
        for (i = len_of_insert - 1, j = 0; i >= 0; i--, j++) {
            if (bit_of_insert[i] != '0') {
                bytes[j / TRIE_BITS_PER_BYTE] |=
                    (unsigned char)(1u << (j % TRIE_BITS_PER_BYTE));
            }
        }
        root->compress_of_bit = bytes;
    }
}

/*
 * Inserts the '0'/'1' string `bit_of_insert` below `*root`.
 *
 * `*root` may be replaced by a new parent node when an existing bit run has
 * to be split. Returns 1 on success and 0 when `root` or `*root` is NULL.
 */
int trie_insert(trie_node **root,char* bit_of_insert)
{
    if (root == NULL || *root == NULL) {
        return 0;
    }
    trie_node *node = *root;

    if (node->num_of_bit == 0) {
        if (*bit_of_insert == '\0') {
            node->is_str = (char)1;
            return 1;
        }
        if (node->is_str == 0
            && node->point_of_zero == NULL
            && node->point_of_one == NULL) {
            /* Unused leaf: keep the whole remaining key as its bit run. */
            compress(node, bit_of_insert);
            node->is_str = (char)1;
            return 1;
        }
        return trie_insert(trie_child_slot(node, *bit_of_insert),
                           bit_of_insert + 1);
    }

    int common = compare_of_bit(node, bit_of_insert);
    int len_of_insert = (int)strlen(bit_of_insert);

    if (common == (int)node->num_of_bit) {
        if (common == len_of_insert) {
            node->is_str = (char)1;
            return 1;
        }
        bit_of_insert += common;
        return trie_insert(trie_child_slot(node, *bit_of_insert),
                           bit_of_insert + 1);
    }

    /*
     * The key leaves the node's run after `common` bits: split the node.
     * A new parent takes the common prefix, the old node keeps what follows
     * the branching bit and hangs below the parent on that bit's side.
     */
    char *bit_of_pop = trie_alloc((size_t)node->num_of_bit + 1);
    trie_node *father = trie_node_new();

    pop_bit(node, bit_of_pop, common);
    compress(father, bit_of_pop);
    pop_bit(node, bit_of_pop, 1);
    char branch = bit_of_pop[0];
    free(bit_of_pop);

    if (branch == '0') {
        father->point_of_zero = node;
    } else {
        father->point_of_one = node;
    }
    *root = father;

    if (common == len_of_insert) {
        /* The key is a proper prefix of the old run and ends at the parent.
         * The original left its return value unset on this path. */
        father->is_str = (char)1;
        return 1;
    }

    /* The key continues on the side opposite to the old node; skip the
     * common prefix and the branching bit itself. */
    trie_node **sibling = (branch == '0') ? &father->point_of_one
                                          : &father->point_of_zero;
    *sibling = trie_node_new();
    return trie_insert(sibling, bit_of_insert + common + 1);
}

/*
 * Looks up the '0'/'1' string `bit_of_search`.
 * Returns 1 when the exact key was inserted before, otherwise 0 (also for a
 * NULL root or a key containing characters other than '0' and '1').
 */
int trie_search(trie_node *root,char *bit_of_search)
{
    trie_node *p = root;

    while (p != NULL) {
        if (p->num_of_bit != 0) {
            if (compare_of_bit(p, bit_of_search) != (int)p->num_of_bit) {
                return 0;
            }
            bit_of_search += (int)p->num_of_bit;
        }
        if (*bit_of_search == '\0') {
            return p->is_str;
        }
        if (*bit_of_search == '0') {
            p = p->point_of_zero;
        } else if (*bit_of_search == '1') {
            p = p->point_of_one;
        } else {
            return 0;
        }
        bit_of_search++;
    }
    return 0;
}

/* Frees a whole subtree, including heap-allocated bit runs. */
void trie_delete(trie_node *root)
{
    if (root == NULL) {
        return;
    }
    trie_delete(root->point_of_zero);
    trie_delete(root->point_of_one);
    if (root->num_of_bit > TRIE_INLINE_BITS) {
        free(root->compress_of_bit);
    }
    free(root);
}

/*
 * Returns how many leading characters of `bit_of_insert` match the bit run
 * of `root`; the result is at most the shorter of the two lengths.
 */
int compare_of_bit(trie_node *root,char* bit_of_insert)
{
    int stored = root->num_of_bit;
    int i = 0;

    while (i < stored
           && bit_of_insert[i] != '\0'
           && bit_of_insert[i] - '0' == trie_stored_bit(root, stored - 1 - i)) {
        i++;
    }
    return i;
}

/*
 * Removes the first `len_of_pop` bits of the run stored in `root` and writes
 * them to `bit_of_pop` as a NUL-terminated '0'/'1' string.
 *
 * `bit_of_pop` must have room for len_of_pop + 1 characters. `len_of_pop` is
 * clamped to the range 0..num_of_bit. The storage of the remaining bits is
 * shrunk, or moved inline, when the new length allows it.
 */
void pop_bit(trie_node *root,char* bit_of_pop,int len_of_pop)
{
    int old_bits = root->num_of_bit;
    int new_bits;
    int i, j;

    if (len_of_pop < 0) {
        len_of_pop = 0;
    }
    if (len_of_pop > old_bits) {
        len_of_pop = old_bits;
    }
    new_bits = old_bits - len_of_pop;

    /* The first character of the run sits at the highest bit index. */
    for (i = 0, j = old_bits - 1; i < len_of_pop; i++, j--) {
        bit_of_pop[i] = (char)('0' + trie_stored_bit(root, j));
    }
    bit_of_pop[i] = '\0';

    if (old_bits <= TRIE_INLINE_BITS) {
        if (new_bits < old_bits) {
            /* Drop the popped bits so an empty run leaves a NULL pointer. */
            uintptr_t keep = ((uintptr_t)1 << new_bits) - 1;
            root->compress_of_bit =
                (unsigned char *)((uintptr_t)root->compress_of_bit & keep);
        }
    } else if (new_bits == 0) {
        free(root->compress_of_bit);
        root->compress_of_bit = NULL;
    } else if (new_bits <= TRIE_INLINE_BITS) {
        /* Heap -> inline: read the bits while num_of_bit still says "heap". */
        uintptr_t packed = 0;
        for (j = 0; j < new_bits; j++) {
            if (trie_stored_bit(root, j)) {
                packed |= (uintptr_t)1 << j;
            }
        }
        free(root->compress_of_bit);
        root->compress_of_bit = (unsigned char *)packed;
    } else if (trie_bytes_for(new_bits) != trie_bytes_for(old_bits)) {
        /* The remaining bits occupy the leading bytes, so shrinking keeps
         * them; a failed shrink simply leaves the larger buffer in place. */
        unsigned char *shrunk = realloc(root->compress_of_bit,
                                        trie_bytes_for(new_bits));
        if (shrunk != NULL) {
            root->compress_of_bit = shrunk;
        }
    }
    root->num_of_bit = (unsigned short)new_bits;
}
大规模字符串检索-压缩trie树的更多相关文章
- 835. 字符串统计(Trie树模板题)
维护一个字符串集合,支持两种操作: “I x”向集合中插入一个字符串x: “Q x”询问一个字符串在集合中出现了多少次. 共有N个操作,输入的字符串总长度不超过 105105,字符串仅包含小写英文字母 ...
- Trie树|字典树(字符串排序)
有时,我们会碰到对字符串的排序,若采用一些经典的排序算法,则时间复杂度一般为O(n*lgn),但若采用Trie树,则时间复杂度仅为O(n). Trie树又名字典树,从字面意思即可理解,这种树的结构像英 ...
- Trie树及其应用
Trie树及其应用 Trie树 Trie树,又称单词查找树.字典树,是一种树形结构,是一种哈希树的变种,是一种用于快速检索的多叉树结构.典型应用是用于统计和排序大量的字符串(但不仅限于字符串),所以经 ...
- Trie树(代码),后缀树(代码)
Trie树系列 Trie字典树 压缩的Trie 后缀树Suffix tree 后缀树--ukkonen算法 Trie是通过对字符串进行预先处理,达到加快搜索速度的算法.即把文本中的字符串转换为树结构, ...
- [转]双数组TRIE树原理
原文名称: An Efficient Digital Search Algorithm by Using a Double-Array Structure 作者: JUN-ICHI AOE 译文: 使 ...
- 【动画】看动画轻松理解「Trie树」
Trie树 Trie这个名字取自“retrieval”,检索,因为Trie可以只用一个前缀便可以在一部字典中找到想要的单词. 虽然发音与「Tree」一致,但为了将这种 字典树 与 普通二叉树 以示区别 ...
- Trie树(Prefix Tree)介绍
本文用尽量简洁的语言介绍一种树形数据结构 -- Trie树. 一.什么是Trie树 Trie树,又叫字典树.前缀树(Prefix Tree).单词查找树 或 键树,是一种多叉树结构.如下图: 上图是一 ...
- 数据结构与算法—Trie树
Trie,又经常叫前缀树,字典树等等.它有很多变种,如后缀树,Radix Tree/Trie,PATRICIA tree,以及bitwise版本的crit-bit tree.当然很多名字的意义其实有交 ...
- trie树(前缀树)详解——PHP代码实现
trie树常用于搜索提示.如当输入一个网址,可以自动搜索出可能的选择.当没有完全匹配的搜索结果,可以返回前缀最相似的可能. 一.Tire树的基本性质 根节点不包含字符,除根节点外每一个节点都只包含一个 ...
随机推荐
- 【转】LaTeX 符号命令大全
函数.符号及特殊字符 声调 语法 效果 语法 效果 语法 效果 \bar{x} \acute{\eta} \check{\alpha} \grave{\eta} \breve{a} \ddot{y} ...
- Delphi 编写的Web Service
一编写服务程序 第一步:File----->New----->Other------>WebServices----->Soap Server Application选择I ...
- 用xib文件,配置UITableViewCell
http://www.cnblogs.com/lixingle/p/3287499.html 在运行时候,如果出现“this class is not key value coding-complia ...
- [Python]Pip的安装以及简单的使用
Pip的安装 安装python以后(我的python版本是32位,版本号2.7.10),如果需要安装一些其他的库,一般有两种办法,一种是自己手动去各个库的官网下载,自己安装:另一种方法是安装pip,使 ...
- [Git] Github客户端上publish后一直转圈,web上未上传成功
连续试了几次,publish后一直处于publish状态,点击其它repositories再点回来就没动静了,也看不到Sys按钮...最后发现,是要等很久才会成功,天朝的网络伤不起
- 《Linear Algebra and Its Applications》-chaper6-正交性和最小二乘法-基本概念与定理
这一章节我们主要讨论定义在R^n空间上的向量之间的关系,而这个关系概括来讲其实就是正交,然后引入正交投影.最佳逼近定理等,这些概念将为我们在求无解的线性方程组Ax=b的最优近似解打下基石. 正交性: ...
- hdu 4411 arrest 最小费用流
#include <cstdio> #include <cstring> #include <iostream> #include <cmath> #i ...
- nginx简单双机热备:backup参数的使用
nginx简单双机热备:backup参数的使用 nginx简单双机热备:backup参数的使用
- thinkphp 获取客户端ip地址方法
/** * 获取客户端IP地址 * @param integer $type 返回类型 0 返回IP地址 1 返回IPV4地址数字 * @param boolean $adv 是否进行高级模式获取(有 ...
- Linux命令 — 设置或查看网络配置命令ifconfig
ifconfig命令用于设置或查看网络配置,包括IP地址.网络掩码.广播地址等.它是linux系统中,使用频率最高的关于网络方面的命令. 1. 命令介绍 命令格式: ifconfig [interfa ...