miniblast_hash算法c语言实现

对于一组基因文件中的基因序列，选取固定长度（k-mer）的基因片段作为索引，利用 hash 表查找指定的基因片段，并且在匹配时能容忍一定数量的错误（不匹配的碱基）。

简单讲就是自己实现一个 hashtable，为选出的特定字符串建立索引，便于查询；输出结果时可容忍一定数量的错误。

贴上代码

HashTable.h

#include <climits>
#include <cstdlib>
#include <iostream>
#include <string>
#include <utility>

using namespace std;

typedef int KeyType;

#define NULLKEY -1

struct Entry {

    KeyType _key;

    string    _value;

    Entry(KeyType key = NULLKEY, string value = "") :_key(key), _value(value) {}

};

class hashTable {

public:

    hashTable();

    hashTable::hashTable(int table_size, double occupancy_get);

    //hashTable(int tableSize);

    ~hashTable();

    bool find(const Entry&  e);

    string hashTable::getValue(const KeyType& k);

    bool insert(const Entry& e);

    bool remove(const Entry& e);

    void clear();

    Entry& operator[](KeyType key);//重载下标操作；当找不到key对应的Entry时，插入Entry(key,0)

    int size();

    void display();

protected:

    int hashFunction(KeyType key);//将键值映射到对应地址

    void rehash();//调整hashTable大小

    bool find(const KeyType& k);//按键值查找

    int nextPrime();//p(n) = n^2 - n + 41, n<41, p<1681

private:

    Entry *_pTable;

    int _pos;//当前访问元素的位置

    int _size;

    int _capacity;

    int primeIndex;

};

HashTable.cpp

#include "HashTable.h"

static double _occupancy = 0.5;

//开放定址法

hashTable::hashTable()

{

    _capacity = ;//初始化hashTable容量为100,便于观察rehash过程

    _pTable = new Entry[_capacity];

    _size = ;

    primeIndex = ;

}

hashTable::hashTable(int table_size,double occupancy_get)

{

    _capacity = table_size;

    _occupancy = occupancy_get;

    _pTable = new Entry[_capacity];

    _size = ;

    primeIndex = ;

}

hashTable::~hashTable()

{

    clear();

}

int hashTable::nextPrime()

{

    int p = std::pow(static_cast<float>(primeIndex), ) - primeIndex +;

    primeIndex = primeIndex << ;

    if (primeIndex >= ) {

        cout << "Max capacity reached. exit!" << endl;

        exit(-);

    }

    return p;

}

bool hashTable::find(const Entry&  e)

{

    return(find(e._key));

}

bool hashTable::find(const KeyType& k)

{

    _pos = hashFunction(k);

    if (_pTable[_pos]._key == NULLKEY)

        return false;

    int lastPos = _pos;

    while (_pTable[_pos]._key != k) {

        if (++_pos%_capacity == lastPos)

            return false;

    }

    return true;

}

string hashTable::getValue(const KeyType& k)

{

    _pos = hashFunction(k);

    if (_pTable[_pos]._key == NULLKEY)

        return "";

    int lastPos = _pos;

    while (_pTable[_pos]._key != k) {

        if (++_pos%_capacity == lastPos)

            return "";

    }

    return _pTable[_pos]._value;

}

bool hashTable::insert(const Entry& e)

{

    if ((_size*1.0) / _capacity>_occupancy) //通过occupancy判断是否需要扩容

        rehash();//[OK]插入操作前，需要判断hash table是否需要扩容

    if (find(e))

        return false;

    _pTable[_pos] = e;

    ++_size;

    return true;

}

bool hashTable::remove(const Entry& e)

{

    if (!find(e))

        return false;

    _pTable[_pos]._key = NULLKEY;

    --_size;

    //rehash();//移除操作后，需要判断hash table是否需要缩容

    return true;

}

void hashTable::clear()

{

    delete[]_pTable;

    _size = _capacity = ;

}

Entry& hashTable::operator[](KeyType key)

{

    if (!find(key))

        insert(Entry(key, ));

    return _pTable[_pos];

}

int hashTable::size()

{

    return _size;

}

int hashTable::hashFunction(KeyType key)

{

    return key%_capacity;

}

void hashTable::display()

{

    cout << "capacity = " << _capacity << ", size = " << _size << endl;

    for (int i = ; i<_capacity; i++) {

        if (_pTable[i]._key != NULLKEY);

//            cout << "key=" << _pTable[i]._key << ",value=" << _pTable[i]._value << endl;

    }

}

void hashTable::rehash()

{

    //cout << "begin rehash..." << endl;

    Entry *p = new Entry[_size];//用来暂存原哈希表

    for (int i = ; i<_capacity; i++) {//i<_size不对；元素散列在容量为_capacity的hashTable中

        if (_pTable[i]._key != NULLKEY)

            *(p + i) = _pTable[i];

    }

    delete[]_pTable;

    int lastSize = _size;

    _size = ;

    _capacity = nextPrime();

    _pTable = new Entry[_capacity];

    for (int i = ; i<lastSize; i++)

        insert(*(p + i));

    delete[]p;

}

miniblast_hash.cpp

#include <iostream>

#include <string>

#include <fstream>

#include "HashTable.h"

using namespace std;

const int KMER_MAX = ;

//得到基因组文件中，position开始，长度为len的字符串

string getStringByPos(string file_name, long position,int str_len) {

    //读入数据

    std::ifstream myfile(file_name);

    if (!myfile.is_open())

    {

        cout << "未成功打开文件" << endl;

    }

    string s = "";

    //移动文件指针到基因指定位置，因为有无用字符，需要边移动边判断

    for (long i = ; i < position; i++)

    {

        char c;

        myfile.get(c);

        //移动指针的过程中要忽略无意义字符

        if (c=='/0'||c=='\n')

            i--;

    }

    //截取指定长度的字符串

    for (long i = ; i < str_len; i++)

    {

        char c;

        myfile.get(c);

        //忽略无意义字符

        if (c == '/0' || c == '\n')

            i--;

        else

        {

            switch (c)

            {

            case 'A':

                s.append("A");

                break;

            case 'G':

                s.append("G");

                break;

            case 'C':

                s.append("C");

                break;

            case 'T':

                s.append("T");

                break;

            default:

                break;

            }

        }

    }

    return s;

}

//比较检索基因片段与原始片段相差个数

int compareString(string s1, string s2, int len) {

    int num = ;

    for (int i = ; i < len; i++)

    {

        if (s1[i] != s2[i])

            num++;

    }

    return num;

}

//搜索指定基因片段

void searchGenome(string file_name,hashTable *pTable,int mistakeNum,int kmer,string query_string,string out_name) {

    char *kmer_cut = new char[kmer];

    int length = query_string.length();

    int findNum = ;

    int table_size = pTable->size();

    //std::ofstream out(out_name, ios::app);//输出结果至文件

    for (int i = ; i <kmer; i++)

    {

        kmer_cut[i] = query_string[i];

    }

    string kmer_string(&kmer_cut[],&kmer_cut[kmer]);

    for (int i = ; i <table_size; i++) {

        string get = pTable->getValue(i);

        if (kmer_string == get)

        {

            if (i + length > table_size)

                continue;

            string ss = getStringByPos(file_name, i, length);

            int num = compareString(ss, query_string,length);

            if (num<=mistakeNum)

            {

                findNum++;

                cout <<i<<" "<<num<<" "<<ss << endl;

            //    out << i <<" "<< num <<" " << ss << endl;

            }

        }

    }

    if (findNum==)

    {

        cout<< "No Match" << endl;

    //    out << "No Match" << endl;

    }

    //out.close();

}

//构建基因搜索用的hashtable索引

void GenomeIndex(string genome_file_name,int kmer, hashTable *pTable) {

    //读入数据,建立索引

    std::ifstream myfile(genome_file_name);

    if (!myfile.is_open())

    {

        cout << "未成功打开文件" << endl;

    }

    char *c = new char[KMER_MAX];

    myfile.get(c, kmer + );

    string add(&c[], &c[kmer]);

    int geno_loc = ;

    pTable->insert(Entry(, add));

    //cout << add << endl;

    while (!myfile.eof())

    {

        char next_char;

        myfile.get(next_char);

        //去除文件中无用字符

        if (next_char != '/0'&&next_char != '\n')

        {

            for (int i = ; i < kmer; i++)

                c[i] = c[i + ];

            c[kmer-] = next_char;

            string add(&c[], &c[kmer]);

            geno_loc++;

            pTable->insert(Entry(geno_loc, add));

            //cout << add <<geno_loc<< endl;

        }

    }

    myfile.close();

}

int main(int   argc, char*   argv[])

{

    /*string input_file_name ="";

    string out_file_name = "";

    if (argc>1)

    {

        input_file_name = string(argv[1]);

        out_file_name = string(argv[2]);

    }*/

    //如果不需要命令行参数执行文件，直接赋值input_file_name就好

    //input_file_name = "input_small.txt";

    std::freopen(argv[], "r", stdin);

    std::freopen(argv[], "w", stdout);

    //std::ifstream inputfile(input_file_name);

    hashTable *pTable=new hashTable();

    /*if (!inputfile.is_open())

    {

        cout << "未成功打开文件" << endl;

    }*/

    //先读取命令文件，设置相关参数，初始值为随意指定，无意义

    string str_order = "";

    string genome_file_name = "";

    int table_size =;

    double occupancy =0.4;

    int kmer = ;

    int mistakeNum = ;

    string query_string;

    bool isIndex = false;

    while (str_order!="quit")

    {

        cin >> str_order;

        if (str_order=="genome")

        {

            cin >> genome_file_name;

        }

        else if (str_order == "table_size") {

            cin >> table_size;

        }

        else if (str_order == "occupancy") {

            cin >> occupancy;

            pTable = new hashTable(table_size, occupancy);

        }

        else if (str_order == "kmer") {

            cin >> kmer;

        }

        else if (str_order == "query") {

            cin >> mistakeNum >> query_string;

            //首次进行查询前，要建立索引hashTable

            if (!isIndex)

            {

                isIndex = true;

                GenomeIndex(genome_file_name, kmer, pTable);

            }

            //std::ofstream out(out_file_name,ios::app);//输出结果至文件

            /*if (out.is_open())

            {

                out << "Query: " << query_string <<endl;

            }*/

            cout << "Query: " << query_string << endl;

            //out.close();

            searchGenome(genome_file_name, pTable, mistakeNum, kmer, query_string, "");

        }

    }

    delete pTable;//释放指针，防止内存泄露

    //getchar();//暂停程序，观察结果

    return ;

}

　　因为需要，代码上有详细注解，欢迎交流学习。

miniblast_hash算法c语言实现的更多相关文章

【转】位置式、增量式PID算法C语言实现
位置式.增量式PID算法C语言实现芯片:STM32F107VC 编译器:KEIL4 作者:SY 日期:2017-9-21 15:29:19 概述 PID 算法是一种工控领域常见的控制算法,用于闭环反 ...
PID算法(C语言)
/************ PID算法(C语言) ************/ #include <stdio.h> #include<math.h> struct _pid { ...
PageRank算法R语言实现
PageRank算法R语言实现 Google搜索,早已成为我每天必用的工具,无数次惊叹它搜索结果的准确性.同时,我也在做Google的SEO,推广自己的博客.经过几个月尝试,我的博客PR到2了,外链也 ...
数据挖掘算法R语言实现之决策树
数据挖掘算法R语言实现之决策树最近,看到很多朋友问我如何用数据挖掘算法R语言实现之决策树,想要了解这方面的内容如下: > library("party")导入数据包 > ...
数据结构算法C语言实现（八）--- 3.2栈的应用举例：迷宫求解与表达式求值
一.简介迷宫求解:类似图的DFS.具体的算法思路可以参考书上的50.51页,不过书上只说了粗略的算法,实现起来还是有很多细节需要注意.大多数只是给了个抽象的名字,甚至参数类型,返回值也没说的很清楚, ...
数据结构算法C语言实现（六）---2.4一元多项式的表示及相加
一.简述利用链表表示稀疏多项式,并基于之前的一些操作(编程实现上还是有所不同的)组合新的操作实现一元多项式的表示及相加. 二.ADT 抽象数据类型一元多项式的定义 ADT Polyomail{ 数据 ...
数据结构算法C语言实现（五）---2.3重新定义线性链表及其基本操作
一.简述 ...由于链表在空间的合理利用上和插入.删除时不需要移动等的优点,因此在很多场合下,它是线性表的首选存储结构.然而,它也存在着实现某些基本操作,如求线性表的长度时不如顺序存储结构的缺点:另一 ...
数据结构算法C语言实现（二）---2.3线性表的链式表示和实现之单链表
一.简述 [暂无] 二.头文件 #ifndef _2_3_part1_H_ #define _2_3_part1_H_ //2_3_part1.h /** author:zhaoyu email:zh ...
回溯算法-C#语言解决八皇后问题的写法与优化
结合问题说方案,首先先说问题: 八皇后问题:在8X8格的国际象棋上摆放八个皇后,使其不能互相攻击,即任意两个皇后都不能处于同一行.同一列或同一斜线上,问有多少种摆法. 嗯,这个问题已经被使用各种语言解 ...

随机推荐

display:block、inline、inline-block的区别及应用案例
A.display:block就是将元素显示为块级元素. block元素的特点是: 1.总是在新行上开始: 2.高度,行高以及顶和底边距都可控制: 3.宽度缺省是它的容器的100%,除非设定一个宽度; ...
如何写.gitignore只包含指定的文件扩展名
# .gitignore # 首先忽略所有的文件 * # 但是不忽略目录 !*/ # 忽略一些指定的目录名 ut/ # 不忽略下面指定的文件类型 !*.c++ !*.cc !*.cp !*.cpp ! ...
Metronic 对话 chat
http://keenthemes.com/preview/metronic/theme/admin_1/index.html: jquery让滚动条默认在最底部:$('#content').scro ...
Mac下使用sublime Text打开隐藏目录
我们用 sublime Text 打开时,默认是看到非隐藏的目录和文件,如下图: 这时候在这个节目,按下 command +shift + 句号快捷键,会自动切换隐藏状态的, 这时候就可以切换成下 ...
csharp:A Custom CheckedListBox with Datasource
/// <summary> /// (eraghi) /// Custom CheckedListBox with binding facilities (Value property) ...
基于jQuery日历插件制作日历
这篇文章主要介绍了基于jQuery日历插件制作日历的相关资料,需要的朋友可以参考下来看下最终效果图吧: 是长得丑了一点,不要吐槽我-.- 首先来说说这个日历主要的制作逻辑吧: ·一个月份最多有31天 ...
git杂记-远程仓库的使用
查看远程仓库:克隆自己的仓库,如不命名则默认远程仓库名字为origin: $ git clone https://github.com/OuFeng/JF_WEB.git Cloning into ' ...
web第一章(html)
HTML介绍 HyperText(超文本) Markup(标记) Language(语音) 类似于XML都是由标签组成 xml:是可扩展标记语言,标签可以任意自定义 HTML:不可以使用任意标签,学习 ...
.net C# Sql数据库SQLHelper类
using System;using System.Collections.Generic;using System.Text;using System.Collections;using Syste ...
Android学习笔记(2)----LocationManager的使用
今天使用Android的LocationManager制作了一款获取当前经纬坐标位置的软件. LocationManager获取的只是经纬坐标点,为了解析出当前经纬坐标点的实际位置,可以使用Googl ...

miniblast_hash算法c语言实现

miniblast_hash算法c语言实现的更多相关文章

随机推荐

热门专题