散列查找的C实现

概念

散列查找，类似与查英文字典的过程。如果我们要查找“zoo”（key）对应的释义（value），我们不会从第一页开始逐页查找（顺序查找），而是直接根据大致的推算（Hash函数），找到最后几页。

所以散列查找的基本思想是根据key通过hash函数计算出value的位置，然后得到value。

按理说每一个key都对应不同的地址是最理想的，然而这种hash函数很难找到（或者说没有实用意义，比如线性函数）就会发生多个key对应一个地址的现象，这是就需要去处理冲突。

数字关键字的散列函数的构造方法一般有：

1）直接定址法

h(key) = a * key + b

计算简单，且不会产生冲突，但要求地址集合与关键词集合大小相同，所以并不实用。

数组使用的就是直接定址法。

2）除留余数法

h(key) = key mod p

一般选取key为素数，这样均匀分布的可能性比较大。

3）数字分析法

就是根据具体情况来处理。比如使用身份证号码作为关键字，那么应选取其中比较随机的数值，比如前两位代表省编号的数字就不适合，会导致聚集效应。

字符关键字的散列函数构造

1）ASCII码加和法

h(key) = (∑key[i]) mod TableSize

即将每一位字符对应的ASCII码加起来求和作为特征值。但这种方法太过简单，假设最大长度为8的字符串，(∑key[i])的取值区间为0至1016，而关键词集合包含有63^8种关键词，如此显然会造成严重的冲突。

2）简单改进——前三个字符移位法

h(key) = (key[0] + key[1]*27 + key[2]*27^2) mod TableSize

3）好的散列函数——移位法

只考虑26个字母，那么一个字母至少需要5位去存储，假设字符串最大长度为12，那么取一个64位长度的无符号整数，依次移位将字符串映射到整数上去，便得到了特征值，不会出现重复。

处理冲突的方法

1）开放地址法

1.线性探测法

比如在插入时发生冲突（已被占据），那么逐次向后查找空位。

会产生一次聚集（Primary Clustering）现象

2.平方探测法

2）分离链表法

实现

//头文件

//HashTable.h

#ifndef HASHTABLE_H_

#define HASHTABLE_H_

#include <stdbool.h>

#define KEYLENGTH 20

#define MAXTABLESIZE 100000

//哈希表类型

typedef struct TblNode* HashTable;

//关键字类型为字符串，最大长度由KEYLENGTH宏定义，只能包含大小写字母，数字，及下划线和空格

typedef char KeyType[KEYLENGTH + 1];

//值类型，为void指针

typedef void* ValueType;

//自定义的释放数据函数指针类型

typedef void (*FreeIt)(void* data);

//创建一个容量为N的散列表(N < MAXTABLESIZE)

//表越大性能越好，同时也越占据内存

HashTable CreateTable(int TableSize);

//向表中 插入/修改 键值关系，如果出错则返回false（比如key中有非法字符）

bool InsertEntry(HashTable H, const KeyType key, const ValueType data);

//通过key-value的关系获取值，如果不存在或key中有非法字符则返回NULL

ValueType getValue(const HashTable H, const KeyType key);

//通过key来删除关系，失败返回false

bool DelEntry(HashTable H, const KeyType key);

//释放散列表占据的内存

//如果不需要释放数据，则第二个为NULL

void DestroyTable(HashTable H, FreeIt func);

#endif

//HashTable.c

#include "HashTable.h"

#include <stdint.h>

#include <string.h>

#include <ctype.h>

#include <stdbool.h>

#include <malloc.h>

#include <math.h>

#include <stdio.h>

#define ERROR -1

//一个存储单元包括键，值，及状态标记

typedef struct HashEntry Cell;

struct HashEntry {

	KeyType Key;	//Key是一个数组

	ValueType Data;

	Cell* Next;	//单向链表

};

//哈希表，存放存储单元数组

//主干使用数组，分支使用链表

typedef struct TblNode* HashTable;

struct TblNode {

	int TableSize;

	Cell* Cells;

};

//将单个字符（数字、字母、下划线或空格）映射到0-63上

//返回ERROR表示出错

static int CharMap_(char ch)

{

	//数字映射到0-9

	if (isdigit(ch)) {

		return ch - '0';

	} else

	//大写字母映射到10-35

	if (isupper(ch)) {

		return ch - 'A' + 10;

	} else

	//小写字母映射到36-61

	if(islower(ch)) {

		return ch - 'a' + 36;

	} else

	//空格映射到62

	if (ch == ' ') {

		return 62;

	} else

	//下划线映射到63

	if (ch == '_') {

		return 63;

	} else {

	//超出映射范围

		fprintf(stderr, "Error key format\n");

		return ERROR;

	}

}

//利用移位法计算字符串关键字映射的地址

//每一个字符有26*2+2+10=2^6,64/6=10，

//字符串最多10位，如果超过10位，便只取奇数位数字

//于是有效长度为20位

//如果出错，返回ERROR

static int Hash(const char* key, int TableSize)

{

	//关键字长度

	int len = strlen(key);

	//步长

	int step = (len > 10) ? 2 : 1;

	//字符串尾指针

	const char* const end = key + len;

	//散列函数值，初始化为0

	uint64_t H = 0;

	//每一次左移6位，把字符的6位映射刻上去

	while (key < end) {

		int val = CharMap_(*key);

		if (val == ERROR) {

			//错误传递

			fprintf(stderr, "Error in Hash()\n");

			return ERROR;

		}

		H = (H << 6) + val;

		key += step;

	}

	//将计算出来的字符串特征值对表长取模即为地址

	return (H % TableSize);

}

//返回大于N且小于MAXTABLESIZE的最小素数

static int NextPrime(int N)

{

	//从大于N的下一个奇数开始

	int i, p = (N % 2) ? (N + 2) : (N + 1);

	while (p < MAXTABLESIZE) {

		for (i = (int)sqrt(p); i > 2; --i) {

			if (p % i == 0) break;

		}

		//说明for没有中途退出，p是素数

		if (i == 2) break;

		//否则试探下一个素数

		else p += 2;

	}

	return p;

}

//找到key对应的cell的前一个cell的指针，如果返回NULL说明出了问题

//正常情况下不会返回NULL

static Cell* FindEntryPre(const HashTable H, const KeyType key)

{

	//计算得到数组中的位置

	int p = Hash(key, H->TableSize);

	if (p == ERROR) {

		return NULL;

	}

	Cell* pre = &(H->Cells[p]);

	for (; (pre->Next) && strcmp(key, (pre->Next)->Key); pre = pre->Next);

	return pre;

}

//创建一个容量为N的散列表(N < MAXTABLESIZE)

HashTable CreateTable(int TableSize)

{

	HashTable H = (HashTable)malloc(sizeof(struct TblNode));

	//散列表中的数组的实际大小比输入的TableSize大一点

	H->TableSize = NextPrime(TableSize);

	//为数组分配内存

	H->Cells = (Cell*)malloc(sizeof(Cell) * H->TableSize);

	//初始化数组

	for (int i = 0; i < H->TableSize; ++i) {

		H->Cells[i].Data = NULL;

		H->Cells[i].Key[0] = '\0';

		H->Cells[i].Next = NULL;

	}

	return H;

}

//插入键值关系，如果出错则返回false（比如key中有非法字符）

bool InsertEntry(HashTable H, const KeyType key, const ValueType data)

{

	Cell* pre = FindEntryPre(H, key);

	if (pre == NULL) {

		fprintf(stderr, "Error in InsertEntry()\n");

		return false;

	}

	//如果是空的，那就新建一个

	if (pre->Next == NULL) {

		pre->Next = (Cell*)malloc(sizeof(Cell));

		strcpy((pre->Next)->Key, key);

		pre->Next->Next = NULL;

	}

	//修改对应的value

	pre->Next->Data = data;

	return true;

}

//key-value获取值，如果不存在或key中有非法字符则返回NULL

ValueType getValue(const HashTable H, const KeyType key)

{

	Cell* pre = FindEntryPre(H, key);

	if (pre == NULL) {

		fprintf(stderr, "Error in InsertEntry()\n");

		return NULL;

	}

	//不存在，则返回NULL

	if ((pre->Next) == NULL) {

		return NULL;

	} else {

		return pre->Next->Data;

	}

}

//通过key来删除关系

bool DelEntry(HashTable H, const KeyType key)

{

	Cell* pre = FindEntryPre(H, key);

	if (pre == NULL) {

		fprintf(stderr, "Error in DelEntry()\n");

		return false;

	} else

	if (pre->Next == NULL) {

		printf("Can not find %s!\n", key);

	} else {

		//删除此链表节点

		Cell* mid = pre->Next;

		pre->Next = mid->Next;

		free(mid);

	}

	return true;

}

//从给定位置开始释放单项链表（递归）

static void freeList_(Cell* begin, FreeIt func)

{

	if (begin) {

		freeList_(begin->Next, func);

		if (func) func(begin->Data);

		free(begin);

	}

}

//释放散列表占据的内存

//如果不需要释放数据，则第二个为NULL

void DestroyTable(HashTable H, FreeIt func)

{

	//首先释放分支上的链表

	for (int i = 0; i < H->TableSize; ++i) {

		//跳过链表头，递归释放

		freeList_(H->Cells[i].Next, func);

	}

	//然后释放主干的数组

	free(H->Cells);

	//最后释放表

	free(H);

}